001/* 002 * Copyright (C) 2012 Christian Schulte <cs@schulte.it> 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without 006 * modification, are permitted provided that the following conditions 007 * are met: 008 * 009 * o Redistributions of source code must retain the above copyright 010 * notice, this list of conditions and the following disclaimer. 011 * 012 * o Redistributions in binary form must reproduce the above copyright 013 * notice, this list of conditions and the following disclaimer in 014 * the documentation and/or other materials provided with the 015 * distribution. 016 * 017 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 018 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 019 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 020 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, 021 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 022 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 023 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 024 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 025 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 026 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 027 * 028 * $JOMC: JavaIdentifier.java 5106 2016-04-04 19:56:25Z schulte $ 029 * 030 */ 031package org.jomc.jls; 032 033import java.io.Serializable; 034import java.lang.ref.Reference; 035import java.lang.ref.SoftReference; 036import java.text.MessageFormat; 037import java.text.ParseException; 038import java.util.ArrayList; 039import java.util.HashMap; 040import java.util.List; 041import java.util.Locale; 042import java.util.Map; 043import java.util.ResourceBundle; 044 045/** 046 * Data type of a Java identifier. 047 * <p> 048 * This class provides support for parsing and normalizing text to java identifiers as specified in the Java 049 * Language Specification - Java SE 7 Edition - Chapter 3.8ff. 050 * </p> 051 * 052 * @author <a href="mailto:cs@schulte.it">Christian Schulte</a> 053 * @version $JOMC: JavaIdentifier.java 5106 2016-04-04 19:56:25Z schulte $ 054 * @see #normalize(java.lang.String, org.jomc.jls.JavaIdentifier.NormalizationMode) 055 * @see #parse(java.lang.String) 056 * @see #valueOf(java.lang.String) 057 */ 058public final class JavaIdentifier implements CharSequence, Serializable 059{ 060 061 /** 062 * Normalization modes. 063 * 064 * @author <a href="mailto:cs@schulte.it">Christian Schulte</a> 065 * @version $JOMC: JavaIdentifier.java 5106 2016-04-04 19:56:25Z schulte $ 066 * @see JavaIdentifier#normalize(java.lang.String, org.jomc.jls.JavaIdentifier.NormalizationMode) 067 */ 068 public static enum NormalizationMode 069 { 070 071 /** 072 * Mode to normalize by compacting words using camel-case. 073 */ 074 CAMEL_CASE, 075 /** 076 * Mode to normalize by separating words using '_' and by converting all characters to lower-case. 077 */ 078 LOWER_CASE, 079 /** 080 * Mode to normalize by separating words using '_' and by converting all characters to upper-case. 081 */ 082 UPPER_CASE, 083 /** 084 * Mode to normalize according to the 085 * <cite>Code Conventions for the Java Programming Language - 9 - Naming Conventions - Constants</cite>. 086 * <blockquote> 087 * The names of variables declared class constants and of ANSI constants should be all uppercase with words 088 * separated by underscores ("_"). (ANSI constants should be avoided, for ease of debugging.) 089 * </blockquote> 090 */ 091 CONSTANT_NAME_CONVENTION, 092 /** 093 * Mode to normalize according to the 094 * <cite>Code Conventions for the Java Programming Language - 9 - Naming Conventions - Methods</cite>. 095 * <blockquote> 096 * Methods should be verbs, in mixed case with the first letter lowercase, with the first letter of each 097 * internal word capitalized. 098 * </blockquote> 099 */ 100 METHOD_NAME_CONVENTION, 101 /** 102 * Mode to normalize according to the 103 * <cite>Code Conventions for the Java Programming Language - 9 - Naming Conventions - Variables</cite>. 104 * <blockquote> 105 * Except for variables, all instance, class, and class constants are in mixed case with a lowercase first 106 * letter. Internal words start with capital letters. Variable names should not start with underscore _ or 107 * dollar sign $ characters, even though both are allowed. Variable names should be short yet meaningful. The 108 * choice of a variable name should be mnemonic - that is - designed to indicate to the casual observer the 109 * intent of its use. One-character variable names should be avoided except for temporary "throwaway" variables. 110 * Common names for temporary variables are i, j, k, m, and n for integers; c, d, and e for characters. 111 * </blockquote> 112 */ 113 VARIABLE_NAME_CONVENTION 114 115 } 116 117 /** 118 * The value of the instance. 119 * 120 * @serial 121 */ 122 private String identifier; 123 124 /** 125 * Cached instances. 126 */ 127 private static volatile Reference<Map<CacheKey, JavaIdentifier>> cache; 128 129 /** 130 * Serial version UID for backwards compatibility with 7.x object streams. 131 */ 132 private static final long serialVersionUID = 7639783770152985285L; 133 134 /** 135 * Underscore character. 136 */ 137 private static final int UNDERSCORE_CODEPOINT = Character.codePointAt( "_", 0 ); 138 139 /** 140 * Creates a new {@code JavaIdentifier} instance. 141 */ 142 private JavaIdentifier() 143 { 144 super(); 145 } 146 147 /** 148 * Returns the length of this character sequence. 149 * 150 * @return The number of {@code char}s in this sequence. 151 */ 152 public int length() 153 { 154 return this.identifier.length(); 155 } 156 157 /** 158 * Returns the {@code char} value at a given index. 159 * 160 * @param index The index of the {@code char} value to return. 161 * 162 * @return The {@code char} value at {@code index}. 163 * 164 * @throws IndexOutOfBoundsException if {@code index} is negative or not less than the length of the sequence. 165 */ 166 public char charAt( final int index ) 167 { 168 return this.identifier.charAt( index ); 169 } 170 171 /** 172 * Returns a new {@code CharSequence} that is a subsequence of this sequence. 173 * 174 * @param start The start index, inclusive. 175 * @param end The end index, exclusive. 176 * 177 * @return The sequence of characters starting at index {@code start} up to index {@code end - 1}. 178 * 179 * @throws IndexOutOfBoundsException if {@code start} or {@code end} are negative, if {@code end} is greater than 180 * the length of the sequence, or if {@code start} is greater than {@code end}. 181 */ 182 public CharSequence subSequence( final int start, final int end ) 183 { 184 return this.identifier.subSequence( start, end ); 185 } 186 187 /** 188 * Returns a string containing the characters in this sequence in the same order as this sequence. The length of the 189 * string will be the length of this sequence. 190 * 191 * @return A string consisting of exactly this sequence of characters. 192 */ 193 @Override 194 public String toString() 195 { 196 return this.identifier; 197 } 198 199 /** 200 * Returns the hash-code value of the object. 201 * 202 * @return The hash-code value of the object. 203 */ 204 @Override 205 public int hashCode() 206 { 207 return this.identifier.hashCode(); 208 } 209 210 /** 211 * Tests whether some other object is equal to the object. 212 * 213 * @param o The object to test. 214 * 215 * @return {@code true}, if {@code o} is an instance of the class of the object and its string value is equal to the 216 * string value of the object. 217 */ 218 @Override 219 public boolean equals( final Object o ) 220 { 221 boolean equal = o == this; 222 223 if ( !equal && o instanceof JavaIdentifier ) 224 { 225 equal = this.toString().equals( o.toString() ); 226 } 227 228 return equal; 229 } 230 231 /** 232 * Normalizes text from the beginning of the given string to produce a {@code JavaIdentifier}. 233 * 234 * @param text The text to normalize. 235 * @param mode The normalization to apply. 236 * 237 * @return A {@code JavaIdentifier} instance constructed by normalizing {@code text} according to {@code mode}. 238 * 239 * @throws NullPointerException if {@code text} or {@code mode} is {@code null}. 240 * @throws ParseException if normalization fails. 241 */ 242 public static JavaIdentifier normalize( final String text, final NormalizationMode mode ) throws ParseException 243 { 244 if ( text == null ) 245 { 246 throw new NullPointerException( "text" ); 247 } 248 if ( mode == null ) 249 { 250 throw new NullPointerException( "mode" ); 251 } 252 253 return parse( text, mode, false ); 254 } 255 256 /** 257 * Parses text from the beginning of a given string to produce a {@code JavaIdentifier} instance. 258 * 259 * @param text The text to parse. 260 * 261 * @return A {@code JavaIdentifier} instance constructed by parsing {@code text}. 262 * 263 * @throws NullPointerException if {@code text} is {@code null}. 264 * @throws ParseException if parsing fails. 265 * 266 * @see #valueOf(java.lang.String) 267 */ 268 public static JavaIdentifier parse( final String text ) throws ParseException 269 { 270 if ( text == null ) 271 { 272 throw new NullPointerException( "text" ); 273 } 274 275 return parse( text, null, false ); 276 } 277 278 /** 279 * Parses text from the beginning of a given string to produce a {@code JavaIdentifier} instance. 280 * <p> 281 * Unlike the {@link #parse(String)} method, this method throws an {@code IllegalArgumentException} if parsing 282 * fails. 283 * </p> 284 * 285 * @param text The text to parse. 286 * 287 * @return A {@code JavaIdentifier} instance constructed by parsing {@code text}. 288 * 289 * @throws NullPointerException if {@code text} is {@code null}. 290 * @throws IllegalArgumentException if parsing fails. 291 * 292 * @see #parse(java.lang.String) 293 */ 294 public static JavaIdentifier valueOf( final String text ) throws IllegalArgumentException 295 { 296 if ( text == null ) 297 { 298 throw new NullPointerException( "text" ); 299 } 300 301 try 302 { 303 return parse( text, null, true ); 304 } 305 catch ( final ParseException e ) 306 { 307 throw new AssertionError( e ); 308 } 309 } 310 311 private static JavaIdentifier parse( final String text, final NormalizationMode mode, 312 final boolean runtimeException ) 313 throws ParseException 314 { 315 Map<CacheKey, JavaIdentifier> map = cache == null ? null : cache.get(); 316 317 if ( map == null ) 318 { 319 map = new HashMap<CacheKey, JavaIdentifier>( 128 ); 320 cache = new SoftReference<Map<CacheKey, JavaIdentifier>>( map ); 321 } 322 323 synchronized ( map ) 324 { 325 final CacheKey key = new CacheKey( text, mode ); 326 JavaIdentifier javaIdentifier = map.get( key ); 327 328 if ( javaIdentifier == null ) 329 { 330 javaIdentifier = new JavaIdentifier(); 331 parseIdentifier( javaIdentifier, text, mode, runtimeException ); 332 333 if ( mode != null ) 334 { 335 final CacheKey normalizedKey = new CacheKey( javaIdentifier.toString(), mode ); 336 final JavaIdentifier normalizedInstance = map.get( normalizedKey ); 337 338 if ( normalizedInstance != null ) 339 { 340 map.put( key, normalizedInstance ); 341 javaIdentifier = normalizedInstance; 342 } 343 else 344 { 345 map.put( key, javaIdentifier ); 346 map.put( normalizedKey, javaIdentifier ); 347 } 348 } 349 else 350 { 351 map.put( key, javaIdentifier ); 352 } 353 } 354 355 return javaIdentifier; 356 } 357 } 358 359 private static void parseIdentifier( final JavaIdentifier t, final String text, final NormalizationMode mode, 360 final boolean runtimeException ) 361 throws ParseException 362 { 363 if ( text.length() <= 0 ) 364 { 365 if ( runtimeException ) 366 { 367 throw new IllegalArgumentException( getMessage( "invalidEmptyString" ) ); 368 } 369 else 370 { 371 throw new ParseException( getMessage( "invalidEmptyString" ), 0 ); 372 } 373 } 374 375 final StringBuilder identifierBuilder = new StringBuilder( text.length() ); 376 final List<Integer> retainedIndices = new ArrayList<Integer>( text.length() ); 377 boolean start_of_word = true; 378 int words = 0; 379 380 for ( int i = 0, j = 1, s0 = text.length(), last_codepoint = -1; i < s0; i++, j++ ) 381 { 382 if ( !isWordSeparator( text.codePointAt( i ), mode, identifierBuilder.length() <= 0 ) ) 383 { 384 if ( mode != null ) 385 { 386 switch ( mode ) 387 { 388 case CAMEL_CASE: 389 if ( start_of_word ) 390 { 391 identifierBuilder.append( Character.toUpperCase( text.charAt( i ) ) ); 392 } 393 else if ( last_codepoint > -1 && j < s0 394 && isCamelCase( last_codepoint, text.codePointAt( i ), 395 text.codePointAt( j ) ) ) 396 { // Retain camel-case in words. 397 identifierBuilder.append( text.charAt( i ) ); 398 retainedIndices.add( identifierBuilder.length() - 1 ); 399 } 400 else 401 { 402 identifierBuilder.append( Character.toLowerCase( text.charAt( i ) ) ); 403 } 404 break; 405 406 case LOWER_CASE: 407 if ( start_of_word && last_codepoint > -1 && last_codepoint != UNDERSCORE_CODEPOINT ) 408 { 409 identifierBuilder.append( Character.toChars( UNDERSCORE_CODEPOINT ) ); 410 } 411 412 identifierBuilder.append( Character.toLowerCase( text.charAt( i ) ) ); 413 break; 414 415 case UPPER_CASE: 416 case CONSTANT_NAME_CONVENTION: 417 if ( start_of_word && last_codepoint > -1 && last_codepoint != UNDERSCORE_CODEPOINT ) 418 { 419 identifierBuilder.append( Character.toChars( UNDERSCORE_CODEPOINT ) ); 420 } 421 422 identifierBuilder.append( Character.toUpperCase( text.charAt( i ) ) ); 423 break; 424 425 case VARIABLE_NAME_CONVENTION: 426 case METHOD_NAME_CONVENTION: 427 if ( start_of_word ) 428 { 429 identifierBuilder.append( words == 0 430 ? Character.toLowerCase( text.charAt( i ) ) 431 : Character.toUpperCase( text.charAt( i ) ) ); 432 433 } 434 else if ( last_codepoint > -1 && j < s0 435 && isCamelCase( last_codepoint, text.codePointAt( i ), 436 text.codePointAt( j ) ) ) 437 { // Retain camel-case in words. 438 identifierBuilder.append( text.charAt( i ) ); 439 retainedIndices.add( identifierBuilder.length() - 1 ); 440 } 441 else 442 { 443 identifierBuilder.append( Character.toLowerCase( text.charAt( i ) ) ); 444 } 445 break; 446 447 default: 448 throw new AssertionError( mode ); 449 450 } 451 } 452 else 453 { 454 identifierBuilder.append( text.charAt( i ) ); 455 } 456 457 last_codepoint = identifierBuilder.codePointAt( identifierBuilder.length() - 1 ); 458 start_of_word = false; 459 } 460 else 461 { 462 if ( mode != null ) 463 { 464 if ( !start_of_word ) 465 { 466 start_of_word = true; 467 words++; 468 } 469 } 470 else if ( runtimeException ) 471 { 472 throw new IllegalArgumentException( getMessage( "invalidCharacter", text, text.charAt( i ), i ) ); 473 } 474 else 475 { 476 throw new ParseException( getMessage( "invalidCharacter", text, text.charAt( i ), i ), i ); 477 } 478 } 479 } 480 481 if ( words > 0 ) 482 { 483 // Multiple words - no camel-case retained in any word. 484 toLowerCase( identifierBuilder, retainedIndices ); 485 } 486 487 t.identifier = identifierBuilder.toString(); 488 489 if ( t.identifier.length() <= 0 ) 490 { 491 if ( runtimeException ) 492 { 493 throw new IllegalArgumentException( getMessage( "invalidCharacters", text ) ); 494 } 495 else 496 { 497 throw new ParseException( getMessage( "invalidCharacters", text ), 0 ); 498 } 499 } 500 501 if ( JavaLanguage.KEYWORDS.contains( t.identifier ) 502 || JavaLanguage.BOOLEAN_LITERALS.contains( t.identifier ) 503 || JavaLanguage.NULL_LITERAL.equals( t.identifier ) ) 504 { 505 if ( mode != null ) 506 { 507 t.identifier = "_" + t.identifier; 508 } 509 else if ( runtimeException ) 510 { 511 throw new IllegalArgumentException( getMessage( "invalidWord", text, t.identifier, 512 text.indexOf( t.identifier ) ) ); 513 514 } 515 else 516 { 517 throw new ParseException( getMessage( "invalidWord", text, t.identifier, text.indexOf( t.identifier ) ), 518 text.indexOf( t.identifier ) ); 519 520 } 521 } 522 } 523 524 private static boolean isWordSeparator( final int codePoint, final NormalizationMode mode, final boolean first ) 525 { 526 return !( ( first ? Character.isJavaIdentifierStart( codePoint ) : Character.isJavaIdentifierPart( codePoint ) ) 527 && ( mode != null ? Character.isLetterOrDigit( codePoint ) : true ) ); 528 529 } 530 531 private static boolean isCamelCase( final int left, final int middle, final int right ) 532 { 533 return Character.isLowerCase( left ) && Character.isUpperCase( middle ) && Character.isLowerCase( right ); 534 } 535 536 private static void toLowerCase( final StringBuilder stringBuilder, final List<Integer> indices ) 537 { 538 for ( int i = 0, s0 = indices.size(); i < s0; i++ ) 539 { 540 final int index = indices.get( i ); 541 final int cp = Character.toLowerCase( stringBuilder.codePointAt( index ) ); 542 stringBuilder.replace( index, index + 1, String.valueOf( Character.toChars( cp ) ) ); 543 } 544 } 545 546 private static String getMessage( final String key, final Object... args ) 547 { 548 return MessageFormat.format( ResourceBundle.getBundle( 549 JavaIdentifier.class.getName().replace( '.', '/' ), Locale.getDefault() ). 550 getString( key ), args ); 551 552 } 553 554 private static final class CacheKey 555 { 556 557 private final String text; 558 559 private final NormalizationMode mode; 560 561 private CacheKey( final String text, final NormalizationMode mode ) 562 { 563 super(); 564 this.text = text; 565 this.mode = mode; 566 } 567 568 @Override 569 public int hashCode() 570 { 571 int hc = 23; 572 hc = 37 * hc + this.text.hashCode(); 573 hc = 37 * hc + ( this.mode == null ? 0 : this.mode.hashCode() ); 574 return hc; 575 } 576 577 @Override 578 public boolean equals( final Object o ) 579 { 580 boolean equal = o == this; 581 582 if ( !equal && o instanceof CacheKey ) 583 { 584 final CacheKey that = (CacheKey) o; 585 equal = this.mode == that.mode && this.text.equals( that.text ); 586 } 587 588 return equal; 589 } 590 591 } 592 593}