View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.fileupload.util.mime;
18  
19  import java.io.ByteArrayOutputStream;
20  import java.io.UnsupportedEncodingException;
21  import java.util.Base64;
22  import java.util.HashMap;
23  import java.util.Locale;
24  import java.util.Map;
25  
26  /**
27   * Utility class to decode MIME texts.
28   *
29   * @since 1.3
30   */
31  public final class MimeUtility {
32  
33      /**
34       * The {@code US-ASCII} charset identifier constant.
35       */
36      private static final String US_ASCII_CHARSET = "US-ASCII";
37  
38      /**
39       * The marker to indicate text is encoded with BASE64 algorithm.
40       */
41      private static final String BASE64_ENCODING_MARKER = "B";
42  
43      /**
44       * The marker to indicate text is encoded with QuotedPrintable algorithm.
45       */
46      private static final String QUOTEDPRINTABLE_ENCODING_MARKER = "Q";
47  
48      /**
49       * If the text contains any encoded tokens, those tokens will be marked with "=?".
50       */
51      private static final String ENCODED_TOKEN_MARKER = "=?";
52  
53      /**
54       * If the text contains any encoded tokens, those tokens will terminate with "=?".
55       */
56      private static final String ENCODED_TOKEN_FINISHER = "?=";
57  
58      /**
59       * The linear whitespace chars sequence.
60       */
61      private static final String LINEAR_WHITESPACE = " \t\r\n";
62  
63      /**
64       * Mappings between MIME and Java charset.
65       */
66      private static final Map<String, String> MIME2JAVA = new HashMap<>();
67  
68      static {
69          MIME2JAVA.put("iso-2022-cn", "ISO2022CN");
70          MIME2JAVA.put("iso-2022-kr", "ISO2022KR");
71          MIME2JAVA.put("utf-8", "UTF8");
72          MIME2JAVA.put("utf8", "UTF8");
73          MIME2JAVA.put("ja_jp.iso2022-7", "ISO2022JP");
74          MIME2JAVA.put("ja_jp.eucjp", "EUCJIS");
75          MIME2JAVA.put("euc-kr", "KSC5601");
76          MIME2JAVA.put("euckr", "KSC5601");
77          MIME2JAVA.put("us-ascii", "ISO-8859-1");
78          MIME2JAVA.put("x-us-ascii", "ISO-8859-1");
79      }
80  
81      /**
82       * Decode a string of text obtained from a mail header into
83       * its proper form.  The text generally will consist of a
84       * string of tokens, some of which may be encoded using
85       * base64 encoding.
86       *
87       * @param text   The text to decode.
88       * @return The decoded text string.
89       * @throws UnsupportedEncodingException if the detected encoding in the input text is not supported.
90       */
91      public static String decodeText(final String text) throws UnsupportedEncodingException {
92          // if the text contains any encoded tokens, those tokens will be marked with "=?".  If the
93          // source string doesn't contain that sequent, no decoding is required.
94          if (!text.contains(ENCODED_TOKEN_MARKER)) {
95              return text;
96          }
97  
98          int offset = 0;
99          final int endOffset = text.length();
100 
101         int startWhiteSpace = -1;
102         int endWhiteSpace = -1;
103 
104         final StringBuilder decodedText = new StringBuilder(text.length());
105 
106         boolean previousTokenEncoded = false;
107 
108         while (offset < endOffset) {
109             char ch = text.charAt(offset);
110 
111             // is this a whitespace character?
112             if (LINEAR_WHITESPACE.indexOf(ch) != -1) { // whitespace found
113                 startWhiteSpace = offset;
114                 while (offset < endOffset) {
115                     // step over the white space characters.
116                     ch = text.charAt(offset);
117                     if (LINEAR_WHITESPACE.indexOf(ch) == -1) {
118                         // record the location of the first non lwsp and drop down to process the
119                         // token characters.
120                         endWhiteSpace = offset;
121                         break;
122                     }
123                     offset++;
124                 }
125             } else {
126                 // we have a word token.  We need to scan over the word and then try to parse it.
127                 final int wordStart = offset;
128 
129                 while (offset < endOffset) {
130                     // step over the non white space characters.
131                     ch = text.charAt(offset);
132                     if (LINEAR_WHITESPACE.indexOf(ch) != -1) {
133                         break;
134                     }
135                     offset++;
136 
137                     //NB:  Trailing whitespace on these header strings will just be discarded.
138                 }
139                 // pull out the word token.
140                 final String word = text.substring(wordStart, offset);
141                 // is the token encoded?  decode the word
142                 if (word.startsWith(ENCODED_TOKEN_MARKER)) {
143                     try {
144                         // if this gives a parsing failure, treat it like a non-encoded word.
145                         final String decodedWord = decodeWord(word);
146 
147                         // are any whitespace characters significant?  Append 'em if we've got 'em.
148                         if (!previousTokenEncoded && startWhiteSpace != -1) {
149                             decodedText.append(text, startWhiteSpace, endWhiteSpace);
150                             startWhiteSpace = -1;
151                         }
152                         // this is definitely a decoded token.
153                         previousTokenEncoded = true;
154                         // and add this to the text.
155                         decodedText.append(decodedWord);
156                         // we continue parsing from here...we allow parsing errors to fall through
157                         // and get handled as normal text.
158                         continue;
159 
160                     } catch (final ParseException e) {
161                         // just ignore it, skip to next word
162                     }
163                 }
164                 // this is a normal token, so it doesn't matter what the previous token was.  Add the white space
165                 // if we have it.
166                 if (startWhiteSpace != -1) {
167                     decodedText.append(text, startWhiteSpace, endWhiteSpace);
168                     startWhiteSpace = -1;
169                 }
170                 // this is not a decoded token.
171                 previousTokenEncoded = false;
172                 decodedText.append(word);
173             }
174         }
175 
176         return decodedText.toString();
177     }
178 
179     /**
180      * Parse a string using the RFC 2047 rules for an "encoded-word"
181      * type.  This encoding has the syntax:
182      *
183      * encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
184      *
185      * @param word   The possibly encoded word value.
186      * @return The decoded word.
187      * @throws ParseException in case of a parse error of the RFC 2047
188      * @throws UnsupportedEncodingException Thrown when Invalid RFC 2047 encoding was found
189      */
190     private static String decodeWord(final String word) throws ParseException, UnsupportedEncodingException {
191         // encoded words start with the characters "=?".  If this not an encoded word, we throw a
192         // ParseException for the caller.
193 
194         if (!word.startsWith(ENCODED_TOKEN_MARKER)) {
195             throw new ParseException("Invalid RFC 2047 encoded-word: " + word);
196         }
197 
198         final int charsetPos = word.indexOf('?', 2);
199         if (charsetPos == -1) {
200             throw new ParseException("Missing charset in RFC 2047 encoded-word: " + word);
201         }
202 
203         // pull out the character set information (this is the MIME name at this point).
204         final String charset = word.substring(2, charsetPos).toLowerCase(Locale.ROOT);
205 
206         // now pull out the encoding token the same way.
207         final int encodingPos = word.indexOf('?', charsetPos + 1);
208         if (encodingPos == -1) {
209             throw new ParseException("Missing encoding in RFC 2047 encoded-word: " + word);
210         }
211 
212         final String encoding = word.substring(charsetPos + 1, encodingPos);
213 
214         // and finally the encoded text.
215         final int encodedTextPos = word.indexOf(ENCODED_TOKEN_FINISHER, encodingPos + 1);
216         if (encodedTextPos == -1) {
217             throw new ParseException("Missing encoded text in RFC 2047 encoded-word: " + word);
218         }
219 
220         final String encodedText = word.substring(encodingPos + 1, encodedTextPos);
221 
222         // seems a bit silly to encode a null string, but easy to deal with.
223         if (encodedText.isEmpty()) {
224             return "";
225         }
226 
227         try {
228             // the decoder writes directly to an output stream.
229             final ByteArrayOutputStream out = new ByteArrayOutputStream(encodedText.length());
230 
231             final byte[] encodedData = encodedText.getBytes(US_ASCII_CHARSET);
232 
233             // Base64 encoded?
234             if (encoding.equals(BASE64_ENCODING_MARKER)) {
235                 out.write(Base64.getDecoder().decode(encodedData));
236             } else if (encoding.equals(QUOTEDPRINTABLE_ENCODING_MARKER)) { // maybe quoted printable.
237                 QuotedPrintableDecoder.decode(encodedData, out);
238             } else {
239                 throw new UnsupportedEncodingException("Unknown RFC 2047 encoding: " + encoding);
240             }
241             // get the decoded byte data and convert into a string.
242             final byte[] decodedData = out.toByteArray();
243             return new String(decodedData, javaCharset(charset));
244         } catch (final Exception e) {
245             throw new UnsupportedEncodingException("Invalid RFC 2047 encoding");
246         }
247     }
248 
249     /**
250      * Translate a MIME standard character set name into the Java
251      * equivalent.
252      *
253      * @param charset The MIME standard name.
254      * @return The Java equivalent for this name.
255      */
256     private static String javaCharset(final String charset) {
257         // nothing in, nothing out.
258         if (charset == null) {
259             return null;
260         }
261 
262         final String mappedCharset = MIME2JAVA.get(charset.toLowerCase(Locale.ROOT));
263         // if there is no mapping, then the original name is used.  Many of the MIME character set
264         // names map directly back into Java.  The reverse isn't necessarily true.
265         if (mappedCharset == null) {
266             return charset;
267         }
268         return mappedCharset;
269     }
270 
271     /**
272      * Hidden constructor, this class must not be instantiated.
273      */
274     private MimeUtility() {
275         // do nothing
276     }
277 
278 }