/**
 * ========================================
 * JFreeReport : a free Java report library
 * ========================================
 *
 * Project Info:  http://reporting.pentaho.org/
 *
 * (C) Copyright 2000-2007, by Object Refinery Limited, Pentaho Corporation and Contributors.
 *
 * This library is free software; you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Foundation;
 * either version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
 * in the United States and other countries.]
 *
 * ------------
 * $Id: CSVTokenizer.java,v 1.9 2007/04/01 18:49:33 taqua Exp $
 * ------------
 * (C) Copyright 2000-2005, by Object Refinery Limited.
 * (C) Copyright 2005-2007, by Pentaho Corporation.
 */
package org.jfree.report.util;

import java.util.Enumeration;
import java.util.NoSuchElementException;

/**
 * The csv tokenizer class allows an application to break a record in Comma Separated
 * Value format into tokens. The tokenization method is much simpler than the one used by
 * the <code>StringTokenizer</code> class. Apart from unwrapping fields that are enclosed
 * in the quote string, the <code>CSVTokenizer</code> methods do not distinguish among
 * identifiers and numbers, nor do they recognize and skip comments.
/**
 * Splits a single record of delimited text, such as one line of a CSV file, into its
 * fields.
 * <p>
 * Fields are divided by a separator string (a comma by default). A field that starts with
 * the quote string (a double quote by default) runs up to the matching closing quote and
 * may therefore contain the separator; inside such a field a doubled quote stands for one
 * literal quote. Unlike <code>java.util.StringTokenizer</code>, adjacent separators are
 * not merged: an empty field between two separators, before a leading separator or after
 * a trailing separator is returned as an empty string. Separators themselves are never
 * returned as tokens.
 * <p>
 * The separator can be replaced on a per-token basis with {@link #nextToken(String)} and
 * the quote string with {@link #setQuate(String)}.
 * <p>
 * The record is trimmed with <code>String.trim()</code> when the tokenizer is created, so
 * leading and trailing whitespace -- including a leading or trailing tab or space that
 * was meant as a separator -- is discarded before any field is read.
 * <p>
 * The following is one example of the use of the tokenizer. The code:
 * <blockquote><pre>
 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
 *     while (csvt.hasMoreTokens()) {
 *         println(csvt.nextToken());
 *     }
 * </pre></blockquote>
 * prints the following output:
 * <blockquote><pre>
 *     this
 *     is
 *     a
 *     test
 * </pre></blockquote>
 *
 * @author abupon
 */
public class CSVTokenizer implements Enumeration
{
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_COMMA = ",";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_TAB = "\t";
  /**
   * A possible separator constant.
   */
  public static final String SEPARATOR_SPACE = " ";

  /**
   * A possible quote character constant.
   */
  public static final String DOUBLE_QUATE = "\"";
  /**
   * A possible quote character constant.
   */
  public static final String SINGLE_QUATE = "'";

  /**
   * The complete (trimmed) record that should be separated into elements.
   */
  private final String record;
  /**
   * The separator used to find the end of the next unquoted token.
   */
  private String separator;
  /**
   * The string that opens and closes a quoted token.
   */
  private String quate;

  /**
   * The current parsing position: the index just behind the token returned last.
   */
  private int currentIndex;

  /**
   * The number of characters to step over before the next token is read. This is the
   * length of the separator that was in force when the previous token was read, or zero
   * as long as no token has been read.
   */
  private int pendingSkip;

  /**
   * Constructs a csv tokenizer for the specified record with an explicit separator and
   * quote string. The record is trimmed with <code>String.trim()</code> before it is
   * tokenized.
   *
   * @param aString      the record to be parsed.
   * @param theSeparator the separator ({@link #SEPARATOR_COMMA}, {@link #SEPARATOR_TAB},
   *                     {@link #SEPARATOR_SPACE}, etc.).
   * @param theQuate     the quote string ({@link #SINGLE_QUATE}, {@link #DOUBLE_QUATE},
   *                     etc.).
   * @throws NullPointerException     if one of the arguments is <code>null</code>.
   * @throws IllegalArgumentException if the separator or the quote string is empty.
   */
  public CSVTokenizer(final String aString, final String theSeparator,
                      final String theQuate)
  {
    if (aString == null)
    {
      throw new NullPointerException("The given string is null");
    }
    this.separator = requireText(theSeparator, "separator");
    this.quate = requireText(theQuate, "quate");
    this.record = aString.trim();
    this.currentIndex = 0;
    this.pendingSkip = 0;
  }

  /**
   * Constructs a csv tokenizer for the specified record that uses the given separator and
   * {@link #DOUBLE_QUATE} as quote string.
   *
   * @param aString      the record to be parsed.
   * @param theSeparator the separator ({@link #SEPARATOR_COMMA}, {@link #SEPARATOR_TAB},
   *                     {@link #SEPARATOR_SPACE}, etc.).
   * @throws NullPointerException     if one of the arguments is <code>null</code>.
   * @throws IllegalArgumentException if the separator is empty.
   */
  public CSVTokenizer(final String aString, final String theSeparator)
  {
    this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
  }

  /**
   * Constructs a csv tokenizer for the specified record that uses
   * {@link #SEPARATOR_COMMA} as separator and {@link #DOUBLE_QUATE} as quote string.
   *
   * @param aString the record to be parsed.
   * @throws NullPointerException if the record is <code>null</code>.
   */
  public CSVTokenizer(final String aString)
  {
    this(aString, CSVTokenizer.SEPARATOR_COMMA);
  }

  /**
   * Checks that a separator or quote string is usable.
   *
   * @param value the string to check.
   * @param what  the name of the argument, used in the exception message.
   * @return the checked value.
   * @throws NullPointerException     if the value is <code>null</code>.
   * @throws IllegalArgumentException if the value is empty.
   */
  private static String requireText(final String value, final String what)
  {
    if (value == null)
    {
      throw new NullPointerException("The given " + what + " is null");
    }
    if (value.length() == 0)
    {
      // An empty separator never advances the position and an empty quote matches at
      // every position, so tokenizing with either one could not terminate.
      throw new IllegalArgumentException("The given " + what + " is empty");
    }
    return value;
  }

  /**
   * Tests if there are more tokens available from this tokenizer's record. If this
   * method returns <tt>true</tt>, a subsequent call to <tt>nextToken</tt> will not fail
   * with a <code>NoSuchElementException</code>.
   *
   * @return <code>true</code> if the current position lies before the end of the record,
   *         <code>false</code> otherwise.
   */
  public boolean hasMoreTokens()
  {
    return this.currentIndex < this.record.length();
  }

  /**
   * Returns the next token from this tokenizer. A token that starts with the quote string
   * is returned without the enclosing quotes and with doubled quotes collapsed into one;
   * any other token is returned exactly as it appears between two separators.
   *
   * @return the next token from this tokenizer.
   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's
   *                                  record.
   * @throws IllegalArgumentException if a quoted token has no closing quote.
   */
  public String nextToken()
      throws NoSuchElementException, IllegalArgumentException
  {
    if (!hasMoreTokens())
    {
      throw new NoSuchElementException();
    }

    // Step over the separator that ended the previous token. The position is clamped so
    // that a multi-character separator cannot push it beyond the end of the record.
    final int start = Math.min(this.currentIndex + this.pendingSkip, this.record.length());
    this.currentIndex = start;
    this.pendingSkip = this.separator.length();

    if (this.record.startsWith(this.quate, start))
    {
      return readQuotedToken(start);
    }

    final int end = this.record.indexOf(this.separator, start);
    if (end < 0)
    {
      // No further separator: the rest of the record is the last token.
      this.currentIndex = this.record.length();
      return this.record.substring(start);
    }
    // Stay on the separator; it is skipped when the next token is requested, which is
    // what lets a trailing separator produce a final empty token.
    this.currentIndex = end;
    return this.record.substring(start, end);
  }

  /**
   * Reads a quoted token and moves the current position behind its closing quote.
   *
   * @param start the index of the opening quote.
   * @return the token without the enclosing quotes; doubled quotes are collapsed.
   * @throws IllegalArgumentException if the closing quote is missing.
   */
  private String readQuotedToken(final int start)
  {
    final int quoteLength = this.quate.length();
    final StringBuffer token = new StringBuffer();
    int pos = start + quoteLength;
    for (; ;)
    {
      final int end = this.record.indexOf(this.quate, pos);
      if (end < 0)
      {
        throw new IllegalArgumentException("Illegal format");
      }
      token.append(this.record.substring(pos, end));
      pos = end + quoteLength;
      if (!this.record.startsWith(this.quate, pos))
      {
        // A single quote string closes the token.
        break;
      }
      // Two quote strings in a row stand for one literal quote string.
      token.append(this.quate);
      pos += quoteLength;
    }
    this.currentIndex = pos;
    return token.toString();
  }

  /**
   * Switches to a new separator and returns the next token. The separator that ended the
   * previous token is still skipped with its own length; the new separator is used to
   * find the end of the token returned by this call and remains in force afterwards.
   *
   * @param theSeparator the new separator.
   * @return the next token, after switching to the new separator.
   * @throws NullPointerException     if the separator is <code>null</code>.
   * @throws IllegalArgumentException if the separator is empty or a quoted token has no
   *                                  closing quote.
   * @throws java.util.NoSuchElementException
   *                                  if there are no more tokens in this tokenizer's
   *                                  record.
   */
  public String nextToken(final String theSeparator)
  {
    this.separator = requireText(theSeparator, "separator");
    return nextToken();
  }

  /**
   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that
   * this class can implement the <code>Enumeration</code> interface.
   *
   * @return <code>true</code> if there are more tokens; <code>false</code> otherwise.
   * @see java.util.Enumeration
   * @see #hasMoreTokens()
   */
  public boolean hasMoreElements()
  {
    return hasMoreTokens();
  }

  /**
   * Returns the same value as the <code>nextToken</code> method, except that its declared
   * return value is <code>Object</code> rather than <code>String</code>. It exists so
   * that this class can implement the <code>Enumeration</code> interface.
   *
   * @return the next token in the record.
   * @throws java.util.NoSuchElementException
   *          if there are no more tokens in this tokenizer's record.
   * @see java.util.Enumeration
   * @see #nextToken()
   */
  public Object nextElement()
  {
    return nextToken();
  }

  /**
   * Calculates the number of times that this tokenizer's <code>nextToken</code> method
   * can be called before it generates an exception. The current position is not
   * advanced, not even when the remaining record turns out to be malformed.
   *
   * @return the number of tokens remaining in the record using the current separator.
   * @throws IllegalArgumentException if a remaining quoted token has no closing quote.
   * @see #nextToken()
   */
  public int countTokens()
  {
    final int savedIndex = this.currentIndex;
    final int savedSkip = this.pendingSkip;
    try
    {
      int count = 0;
      while (hasMoreTokens())
      {
        nextToken();
        count++;
      }
      return count;
    }
    finally
    {
      // Counting is a dry run: rewind even if a malformed token aborted it.
      this.currentIndex = savedIndex;
      this.pendingSkip = savedSkip;
    }
  }

  /**
   * Returns the quote string.
   *
   * @return the string that opens and closes a quoted token.
   */
  public String getQuate()
  {
    return this.quate;
  }

  /**
   * Sets the quote string that is used for all tokens read from now on.
   *
   * @param quate the quote string to set.
   * @throws NullPointerException     if the quote string is <code>null</code>.
   * @throws IllegalArgumentException if the quote string is empty.
   */
  public void setQuate(final String quate)
  {
    this.quate = requireText(quate, "quate");
  }
}