public final class
NormalizerImpl

extends Object
/*
 * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.BufferedInputStream;
import java.io.InputStream;

/**
 * @author  Ram Viswanadha
 */
public final class NormalizerImpl {
    // Static block for the class to initialize its own self
    static final NormalizerImpl IMPL;

    static
    {
        try
        {
            IMPL = new NormalizerImpl();
        }
        catch (Exception e)
        {
            throw new RuntimeException(e.getMessage());
        }
    }

    static final int UNSIGNED_BYTE_MASK =0xFF;
    static final long UNSIGNED_INT_MASK = 0xffffffffL;
    /*
     * This new implementation of the normalization code loads its data from
     * unorm.icu, which is generated with the gennorm tool.
     * The format of that file is described at the end of this file.
     */
    private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";

    // norm32 value constants

    // quick check flags 0..3 set mean "no" for their forms
    public static final int QC_NFC=0x11;          /* no|maybe */
    public static final int QC_NFKC=0x22;         /* no|maybe */
    public static final int QC_NFD=4;             /* no */
    public static final int QC_NFKD=8;            /* no */

    public static final int QC_ANY_NO=0xf;

    /* quick check flags 4..5 mean "maybe" for their forms;
     * test flags>=QC_MAYBE
     */
    public static final int QC_MAYBE=0x10;
    public static final int QC_ANY_MAYBE=0x30;

    public static final int QC_MASK=0x3f;

    private static final int COMBINES_FWD=0x40;
    private static final int COMBINES_BACK=0x80;
    public  static final int COMBINES_ANY=0xc0;
    // UnicodeData.txt combining class in bits 15.
    private static final int CC_SHIFT=8;
    public  static final int CC_MASK=0xff00;
    // 16 bits for the index to UChars and other extra data
    private static final int EXTRA_SHIFT=16;

    /* norm32 value constants using >16 bits */
    private static final long  MIN_SPECIAL    =  (long)(0xfc000000 & UNSIGNED_INT_MASK);
    private static final long  SURROGATES_TOP =  (long)(0xfff00000 & UNSIGNED_INT_MASK);
    private static final long  MIN_HANGUL     =  (long)(0xfff00000 & UNSIGNED_INT_MASK);
    private static final long  MIN_JAMO_V     =  (long)(0xfff20000 & UNSIGNED_INT_MASK);
    private static final long  JAMO_V_TOP     =  (long)(0xfff30000 & UNSIGNED_INT_MASK);


    /* indexes[] value names */
    /* number of bytes in normalization trie */
    static final int INDEX_TRIE_SIZE           = 0;
    /* number of chars in extra data */
    static final int INDEX_CHAR_COUNT           = 1;
    /* number of uint16_t words for combining data */
    static final int INDEX_COMBINE_DATA_COUNT = 2;
    /* first code point with quick check NFC NO/MAYBE */
    public static final int INDEX_MIN_NFC_NO_MAYBE   = 6;
    /* first code point with quick check NFKC NO/MAYBE */
    public static final int INDEX_MIN_NFKC_NO_MAYBE  = 7;
    /* first code point with quick check NFD NO/MAYBE */
    public static final int INDEX_MIN_NFD_NO_MAYBE   = 8;
    /* first code point with quick check NFKD NO/MAYBE */
    public static final int INDEX_MIN_NFKD_NO_MAYBE  = 9;
    /* number of bytes in FCD trie */
    static final int INDEX_FCD_TRIE_SIZE      = 10;
    /* number of bytes in the auxiliary trie */
    static final int INDEX_AUX_TRIE_SIZE      = 11;
    /* changing this requires a new formatVersion */
    static final int INDEX_TOP                = 32;


    /* AUX constants */
    /* value constants for auxTrie */
    private static final int AUX_UNSAFE_SHIFT           = 11;
    private static final int AUX_COMP_EX_SHIFT           = 10;
    private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;

    private static final int AUX_MAX_FNC          =   ((int)1<<AUX_COMP_EX_SHIFT);
    private static final int AUX_UNSAFE_MASK      =   (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
    private static final int AUX_FNC_MASK         =   (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
    private static final int AUX_COMP_EX_MASK     =   (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK);
    private static final long AUX_NFC_SKIP_F_MASK =   ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT);

    private static final int MAX_BUFFER_SIZE                    = 20;

    /*******************************/

    /* Wrappers for Trie implementations */
    static final class NormTrieImpl implements Trie.DataManipulate{
        static IntTrie normTrie= null;
       /**
        * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
        * data the index array offset of the indexes for that lead surrogate.
        * @param property data value for a surrogate from the trie, including
        *         the folding offset
        * @return data offset or 0 if there is no data for the lead surrogate
        */
        /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
        public int getFoldingOffset(int value){
            return  BMP_INDEX_LENGTH+
                    ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))&
                    (0x3ff<<SURROGATE_BLOCK_BITS));
        }

    }
    static final class FCDTrieImpl implements Trie.DataManipulate{
        static CharTrie fcdTrie=null;
       /**
        * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
        * data the index array offset of the indexes for that lead surrogate.
        * @param property data value for a surrogate from the trie, including
        *         the folding offset
        * @return data offset or 0 if there is no data for the lead surrogate
        */
        /* fcdTrie: the folding offset is the lead FCD value itself */
        public int getFoldingOffset(int value){
            return value;
        }
    }

    static final class AuxTrieImpl implements Trie.DataManipulate{
        static CharTrie auxTrie = null;
       /**
        * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
        * data the index array offset of the indexes for that lead surrogate.
        * @param property data value for a surrogate from the trie, including
        *        the folding offset
        * @return data offset or 0 if there is no data for the lead surrogate
        */
        /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
        public int getFoldingOffset(int value){
            return (int)(value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS;
        }
    }

    /****************************************************/


    private static FCDTrieImpl fcdTrieImpl;
    private static NormTrieImpl normTrieImpl;
    private static AuxTrieImpl auxTrieImpl;
    private static int[] indexes;
    private static char[] combiningTable;
    private static char[] extraData;

    private static boolean isDataLoaded;
    private static boolean isFormatVersion_2_1;
    private static boolean isFormatVersion_2_2;
    private static byte[] unicodeVersion;

    /**
     * Default buffer size of datafile
     */
    private static final int DATA_BUFFER_SIZE = 25000;

    /**
     * FCD check: everything below this code point is known to have a 0
     * lead combining class
     */
    public static final int MIN_WITH_LEAD_CC=0x300;


    /**
     * Bit 7 of the length byte for a decomposition string in extra data is
     * a flag indicating whether the decomposition string is
     * preceded by a 16-bit word with the leading and trailing cc
     * of the decomposition (like for A-umlaut);
     * if not, then both cc's are zero (like for compatibility ideographs).
     */
    private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80;
    /**
     * Bits 6..0 of the length byte contain the actual length.
     */
    private static final int DECOMP_LENGTH_MASK=0x7f;

    /** Length of the BMP portion of the index (stage 1) array. */
    private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_;
    /** Number of bits of a trail surrogate that are used in index table
     * lookups.
     */
    private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;


   // public utility
   public static int getFromIndexesArr(int index){
        return indexes[index];
   }

   // protected constructor ---------------------------------------------

    /**
    * Constructor
    * @exception thrown when data reading fails or data corrupted
    */
    private NormalizerImpl() throws IOException {
        //data should be loaded only once
        if(!isDataLoaded){

            // jar access
            InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
            BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
            NormalizerDataReader reader = new NormalizerDataReader(b);

            // read the indexes
            indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);

            byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];

            int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
            combiningTable = new char[combiningTableTop];

            int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
            extraData = new char[extraDataTop];

            byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
            byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];

            fcdTrieImpl = new FCDTrieImpl();
            normTrieImpl = new NormTrieImpl();
            auxTrieImpl = new AuxTrieImpl();

            // load the rest of the data data and initialize the data members
            reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable);

            NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl );
            FCDTrieImpl.fcdTrie   = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl  );
            AuxTrieImpl.auxTrie   = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl  );

            // we reached here without any exceptions so the data is fully
            // loaded set the variable to true
            isDataLoaded = true;

            // get the data format version
            byte[] formatVersion = reader.getDataFormatVersion();

            isFormatVersion_2_1 =( formatVersion[0]>2
                                    ||
                                   (formatVersion[0]==2 && formatVersion[1]>=1)
                                 );
            isFormatVersion_2_2 =( formatVersion[0]>2
                                    ||
                                   (formatVersion[0]==2 && formatVersion[1]>=2)
                                 );
            unicodeVersion = reader.getUnicodeVersion();
            b.close();
        }
    }

    /* ---------------------------------------------------------------------- */

    /* Korean Hangul and Jamo constants */

    public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
    public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
    public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */

    public static final int HANGUL_BASE=0xac00;

    public static final int JAMO_L_COUNT=19;
    public static final int JAMO_V_COUNT=21;
    public static final int JAMO_T_COUNT=28;
    public  static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;

    private static boolean isHangulWithoutJamoT(char c) {
        c-=HANGUL_BASE;
        return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
    }

    /* norm32 helpers */

    /* is this a norm32 with a regular index? */
    private static boolean isNorm32Regular(long norm32) {
        return norm32<MIN_SPECIAL;
    }

    /* is this a norm32 with a special index for a lead surrogate? */
    private static boolean isNorm32LeadSurrogate(long norm32) {
        return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP;
    }

    /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
    private static boolean isNorm32HangulOrJamo(long norm32) {
        return norm32>=MIN_HANGUL;
    }

    /*
     * Given norm32 for Jamo V or T,
     * is this a Jamo V?
     */
    private static boolean isJamoVTNorm32JamoV(long norm32) {
        return norm32<JAMO_V_TOP;
    }

    /* data access primitives ----------------------------------------------- */

    public static long/*unsigned*/ getNorm32(char c) {
        return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c)));
    }

    public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32,
                                                               char c2) {
        /*
         * the surrogate index in norm32 stores only the number of the surrogate
         * index block see gennorm/store.c/getFoldedNormValue()
         */
        return ((UNSIGNED_INT_MASK) &
                    NormTrieImpl.normTrie.getTrailValue((int)norm32, c2));
    }
    ///CLOVER:OFF
    private static long getNorm32(int c){
        return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c)));
    }

    /*
     * get a norm32 from text with complete code points
     * (like from decompositions)
     */
    private static long/*unsigned*/ getNorm32(char[] p,int start,
                                              int/*unsigned*/ mask) {
        long/*unsigned*/ norm32= getNorm32(p[start]);
        if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) {
            /* *p is a lead surrogate, get the real norm32 */
            norm32=getNorm32FromSurrogatePair(norm32, p[start+1]);
        }
        return norm32;
    }

    //// for StringPrep
    public static VersionInfo getUnicodeVersion(){
        return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
                                       unicodeVersion[2], unicodeVersion[3]);
    }

    public static char    getFCD16(char c) {
        return  FCDTrieImpl.fcdTrie.getLeadValue(c);
    }

    public static char getFCD16FromSurrogatePair(char fcd16, char c2) {
        /* the surrogate index in fcd16 is an absolute offset over the
         * start of stage 1
         * */
        return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2);
    }
    public static int getFCD16(int c) {
        return  FCDTrieImpl.fcdTrie.getCodePointValue(c);
    }

    private static int getExtraDataIndex(long norm32) {
        return (int)(norm32>>EXTRA_SHIFT);
    }

    private static final class DecomposeArgs{
        int /*unsigned byte*/ cc;
        int /*unsigned byte*/ trailCC;
        int length;
    }
    /**
     *
     * get the canonical or compatibility decomposition for one character
     *
     * @return index into the extraData array
     */
    private static int/*index*/ decompose(long/*unsigned*/ norm32,
                                          int/*unsigned*/ qcMask,
                                          DecomposeArgs args) {
        int p= getExtraDataIndex(norm32);
        args.length=extraData[p++];

        if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) {
            /* use compatibility decomposition, skip canonical data */
            p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK);
            args.length>>=8;
        }

        if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
            /* get the lead and trail cc's */
            char bothCCs=extraData[p++];
            args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
            args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
        } else {
            /* lead and trail cc's are both 0 */
            args.cc=args.trailCC=0;
        }

        args.length&=DECOMP_LENGTH_MASK;
        return p;
    }


    /**
     * get the canonical decomposition for one character
     * @return index into the extraData array
     */
    private static int decompose(long/*unsigned*/ norm32,
                                 DecomposeArgs args) {

        int p= getExtraDataIndex(norm32);
        args.length=extraData[p++];

        if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) {
            /* get the lead and trail cc's */
            char bothCCs=extraData[p++];
            args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8);
            args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs;
        } else {
            /* lead and trail cc's are both 0 */
            args.cc=args.trailCC=0;
        }

        args.length&=DECOMP_LENGTH_MASK;
        return p;
    }


    private static final class NextCCArgs{
        char[] source;
        int next;
        int limit;
        char c;
        char c2;
    }

    /*
     * get the combining class of (c, c2)= args.source[args.next++]
     * before: args.next<args.limit  after: args.next<=args.limit
     * if only one code unit is used, then c2==0
     */
    private static int /*unsigned byte*/ getNextCC(NextCCArgs args) {
        long /*unsigned*/ norm32;

        args.c=args.source[args.next++];

        norm32= getNorm32(args.c);
        if((norm32 & CC_MASK)==0) {
            args.c2=0;
            return 0;
        } else {
            if(!isNorm32LeadSurrogate(norm32)) {
                args.c2=0;
            } else {
                /* c is a lead surrogate, get the real norm32 */
                if(args.next!=args.limit &&
                        UTF16.isTrailSurrogate(args.c2=args.source[args.next])){
                    ++args.next;
                    norm32=getNorm32FromSurrogatePair(norm32, args.c2);
                } else {
                    args.c2=0;
                    return 0;
                }
            }

            return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
        }
    }

    private static final class PrevArgs{
        char[] src;
        int start;
        int current;
        char c;
        char c2;
    }

    /*
     * read backwards and get norm32
     * return 0 if the character is <minC
     * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first
     * surrogate but read second!)
     */
    private static long /*unsigned*/ getPrevNorm32(PrevArgs args,
                                                      int/*unsigned*/ minC,
                                                      int/*unsigned*/ mask) {
        long/*unsigned*/ norm32;

        args.c=args.src[--args.current];
        args.c2=0;

        /* check for a surrogate before getting norm32 to see if we need to
         * predecrement further
         */
        if(args.c<minC) {
            return 0;
        } else if(!UTF16.isSurrogate(args.c)) {
            return getNorm32(args.c);
        } else if(UTF16.isLeadSurrogate(args.c)) {
            /* unpaired first surrogate */
            return 0;
        } else if(args.current!=args.start &&
                    UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) {
            --args.current;
            norm32=getNorm32(args.c2);

            if((norm32&mask)==0) {
                /* all surrogate pairs with this lead surrogate have
                 * only irrelevant data
                 */
                return 0;
            } else {
                /* norm32 must be a surrogate special */
                return getNorm32FromSurrogatePair(norm32, args.c);
            }
        } else {
            /* unpaired second surrogate */
            args.c2=0;
            return 0;
        }
    }

    /*
     * get the combining class of (c, c2)=*--p
     * before: start<p  after: start<=p
     */
    private static int /*unsigned byte*/ getPrevCC(PrevArgs args) {

        return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC,
                                                         CC_MASK)>>CC_SHIFT));
    }

    /*
     * is this a safe boundary character for NF*D?
     * (lead cc==0)
     */
    public static boolean isNFDSafe(long/*unsigned*/ norm32,
                                     int/*unsigned*/ccOrQCMask,
                                     int/*unsigned*/ decompQCMask) {
        if((norm32&ccOrQCMask)==0) {
            return true; /* cc==0 and no decomposition: this is NF*D safe */
        }

        /* inspect its decomposition - maybe a Hangul but not a surrogate here*/
        if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
            DecomposeArgs args=new DecomposeArgs();
            /* decomposes, get everything from the variable-length extra data */
            decompose(norm32, decompQCMask, args);
            return args.cc==0;
        } else {
            /* no decomposition (or Hangul), test the cc directly */
            return (norm32&CC_MASK)==0;
        }
    }

    /*
     * is this (or does its decomposition begin with) a "true starter"?
     * (cc==0 and NF*C_YES)
     */
    public static boolean isTrueStarter(long/*unsigned*/ norm32,
                                          int/*unsigned*/ ccOrQCMask,
                                          int/*unsigned*/ decompQCMask) {
        if((norm32&ccOrQCMask)==0) {
            return true; /* this is a true starter (could be Hangul or Jamo L)*/
        }

        /* inspect its decomposition - not a Hangul or a surrogate here */
        if((norm32&decompQCMask)!=0) {
            int p; /* index into extra data array */
            DecomposeArgs args=new DecomposeArgs();
            /* decomposes, get everything from the variable-length extra data */
            p=decompose(norm32, decompQCMask, args);

            if(args.cc==0) {
                int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK;

                /* does it begin with NFC_YES? */
                if((getNorm32(extraData,p, qcMask)&qcMask)==0) {
                    /* yes, the decomposition begins with a true starter */
                    return true;
                }
            }
        }
        return false;
    }

    /* reorder UTF-16 in-place ---------------------------------------------- */

    /**
     * simpler, single-character version of mergeOrdered() -
     * bubble-insert one single code point into the preceding string
     * which is already canonically ordered
     * (c, c2) may or may not yet have been inserted at src[current]..src[p]
     *
     * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
     *
     * before: src[start]..src[current] is already ordered, and
     *         src[current]..src[p]     may or may not hold (c, c2) but
     *                          must be exactly the same length as (c, c2)
     * after: src[start]..src[p] is ordered
     *
     * @return the trailing combining class
     */
    private static int/*unsigned byte*/ insertOrdered(char[] source,
                                                      int start,
                                                      int current, int p,
                                                         char c, char c2,
                                                         int/*unsigned byte*/ cc) {
        int back, preBack;
        int r;
        int prevCC, trailCC=cc;

        if(start<current && cc!=0) {
            // search for the insertion point where cc>=prevCC
            preBack=back=current;
            PrevArgs prevArgs = new PrevArgs();
            prevArgs.current  = current;
            prevArgs.start    = start;
            prevArgs.src      = source;
            // get the prevCC
            prevCC=getPrevCC(prevArgs);
            preBack = prevArgs.current;

            if(cc<prevCC) {
                // this will be the last code point, so keep its cc
                trailCC=prevCC;
                back=preBack;
                while(start<preBack) {
                    prevCC=getPrevCC(prevArgs);
                    preBack=prevArgs.current;
                    if(cc>=prevCC) {
                        break;
                    }
                    back=preBack;
                }


                // this is where we are right now with all these indicies:
                // [start]..[pPreBack] 0..? code points that we can ignore
                // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc
                // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2)
                // [current]..[p]         1 code point (c, c2) with cc

                // move the code units in between up
                r=p;
                do {
                    source[--r]=source[--current];
                } while(back!=current);
            }
        }

        // insert (c, c2)
        source[current]=c;
        if(c2!=0) {
            source[(current+1)]=c2;
        }

        // we know the cc of the last code point
        return trailCC;
    }

    /**
     * merge two UTF-16 string parts together
     * to canonically order (order by combining classes) their concatenation
     *
     * the two strings may already be adjacent, so that the merging is done
     * in-place if the two strings are not adjacent, then the buffer holding the
     * first one must be large enough
     * the second string may or may not be ordered in itself
     *
     * before: [start]..[current] is already ordered, and
     *         [next]..[limit]    may be ordered in itself, but
     *                          is not in relation to [start..current[
     * after: [start..current+(limit-next)[ is ordered
     *
     * the algorithm is a simple bubble-sort that takes the characters from
     * src[next++] and inserts them in correct combining class order into the
     * preceding part of the string
     *
     * since this function is called much less often than the single-code point
     * insertOrdered(), it just uses that for easier maintenance
     *
     * @return the trailing combining class
     */
    private static int /*unsigned byte*/ mergeOrdered(char[] source,
                                                      int start,
                                                      int current,
                                                      char[] data,
                                                        int next,
                                                        int limit,
                                                        boolean isOrdered) {
            int r;
            int /*unsigned byte*/ cc, trailCC=0;
            boolean adjacent;

            adjacent= current==next;
            NextCCArgs ncArgs = new NextCCArgs();
            ncArgs.source = data;
            ncArgs.next   = next;
            ncArgs.limit  = limit;

            if(start!=current || !isOrdered) {

                while(ncArgs.next<ncArgs.limit) {
                    cc=getNextCC(ncArgs);
                    if(cc==0) {
                        // does not bubble back
                        trailCC=0;
                        if(adjacent) {
                            current=ncArgs.next;
                        } else {
                            data[current++]=ncArgs.c;
                            if(ncArgs.c2!=0) {
                                data[current++]=ncArgs.c2;
                            }
                        }
                        if(isOrdered) {
                            break;
                        } else {
                            start=current;
                        }
                    } else {
                        r=current+(ncArgs.c2==0 ? 1 : 2);
                        trailCC=insertOrdered(source,start, current, r,
                                              ncArgs.c, ncArgs.c2, cc);
                        current=r;
                    }
                }
            }

            if(ncArgs.next==ncArgs.limit) {
                // we know the cc of the last code point
                return trailCC;
            } else {
                if(!adjacent) {
                    // copy the second string part
                    do {
                        source[current++]=data[ncArgs.next++];
                    } while(ncArgs.next!=ncArgs.limit);
                    ncArgs.limit=current;
                }
                PrevArgs prevArgs = new PrevArgs();
                prevArgs.src   = data;
                prevArgs.start = start;
                prevArgs.current =  ncArgs.limit;
                return getPrevCC(prevArgs);
            }

    }
    private static int /*unsigned byte*/ mergeOrdered(char[] source,
                                                      int start,
                                                      int current,
                                                      char[] data,
                                                        final int next,
                                                        final int limit) {
        return mergeOrdered(source,start,current,data,next,limit,true);
    }

    public static NormalizerBase.QuickCheckResult quickCheck(char[] src,
                                                            int srcStart,
                                                            int srcLimit,
                                                            int minNoMaybe,
                                                            int qcMask,
                                                            int options,
                                                            boolean allowMaybe,
                                                            UnicodeSet nx){

        int ccOrQCMask;
        long norm32;
        char c, c2;
        char cc, prevCC;
        long qcNorm32;
        NormalizerBase.QuickCheckResult result;
        ComposePartArgs args = new ComposePartArgs();
        char[] buffer ;
        int start = srcStart;

        if(!isDataLoaded) {
            return NormalizerBase.MAYBE;
        }
        // initialize
        ccOrQCMask=CC_MASK|qcMask;
        result=NormalizerBase.YES;
        prevCC=0;

        for(;;) {
            for(;;) {
                if(srcStart==srcLimit) {
                    return result;
                } else if((c=src[srcStart++])>=minNoMaybe &&
                                  (( norm32=getNorm32(c)) & ccOrQCMask)!=0) {
                    break;
                }
                prevCC=0;
            }


            // check one above-minimum, relevant code unit
            if(isNorm32LeadSurrogate(norm32)) {
                // c is a lead surrogate, get the real norm32
                if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) {
                    ++srcStart;
                    norm32=getNorm32FromSurrogatePair(norm32,c2);
                } else {
                    norm32=0;
                    c2=0;
                }
            }else{
                c2=0;
            }
            if(nx_contains(nx, c, c2)) {
                /* excluded: norm32==0 */
                norm32=0;
            }

            // check the combining order
            cc=(char)((norm32>>CC_SHIFT)&0xFF);
            if(cc!=0 && cc<prevCC) {
                return NormalizerBase.NO;
            }
            prevCC=cc;

            // check for "no" or "maybe" quick check flags
            qcNorm32 = norm32 & qcMask;
            if((qcNorm32& QC_ANY_NO)>=1) {
                result= NormalizerBase.NO;
                break;
            } else if(qcNorm32!=0) {
                // "maybe" can only occur for NFC and NFKC
                if(allowMaybe){
                    result=NormalizerBase.MAYBE;
                }else{
                    // normalize a section around here to see if it is really
                    // normalized or not
                    int prevStarter;
                    int/*unsigned*/ decompQCMask;

                    decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask

                    // find the previous starter

                    // set prevStarter to the beginning of the current character
                    prevStarter=srcStart-1;
                    if(UTF16.isTrailSurrogate(src[prevStarter])) {
                        // safe because unpaired surrogates do not result
                        // in "maybe"
                        --prevStarter;
                    }
                    prevStarter=findPreviousStarter(src, start, prevStarter,
                                                    ccOrQCMask, decompQCMask,
                                                    (char)minNoMaybe);

                    // find the next true starter in [src..limit[ - modifies
                    // src to point to the next starter
                    srcStart=findNextStarter(src,srcStart, srcLimit, qcMask,
                                             decompQCMask,(char) minNoMaybe);

                    //set the args for compose part
                    args.prevCC = prevCC;

                    // decompose and recompose [prevStarter..src[
                    buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);

                    // compare the normalized version with the original
                    if(0!=strCompare(buffer,0,args.length,src,prevStarter,(srcStart-prevStarter), false)) {
                        result=NormalizerBase.NO; // normalization differs
                        break;
                    }

                    // continue after the next starter
                }
            }
        }
        return result;
    }


    //------------------------------------------------------
    // make NFD & NFKD
    //------------------------------------------------------

    public static int decompose(char[] src,int srcStart,int srcLimit,
                                char[] dest,int destStart,int destLimit,
                                 boolean compat,int[] outTrailCC,
                                 UnicodeSet nx) {

        char[] buffer = new char[3];
        int prevSrc;
        long norm32;
        int ccOrQCMask, qcMask;
        int reorderStartIndex, length;
        char c, c2, minNoMaybe;
        int/*unsigned byte*/ cc, prevCC, trailCC;
        char[] p;
        int pStart;
        int destIndex = destStart;
        int srcIndex = srcStart;
        if(!compat) {
            minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE];
            qcMask=QC_NFD;
        } else {
            minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE];
            qcMask=QC_NFKD;
        }

        /* initialize */
        ccOrQCMask=CC_MASK|qcMask;
        reorderStartIndex=0;
        prevCC=0;
        norm32=0;
        c=0;
        pStart=0;

        cc=trailCC=-1;//initialize to bogus value

        for(;;) {
            /* count code units below the minimum or with irrelevant data for
             * the quick check
             */
            prevSrc=srcIndex;

            while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe ||
                                        ((norm32=getNorm32(c))&ccOrQCMask)==0)){
                prevCC=0;
                ++srcIndex;
            }

            /* copy these code units all at once */
            if(srcIndex!=prevSrc) {
                length=(int)(srcIndex-prevSrc);
                if((destIndex+length)<=destLimit) {
                    System.arraycopy(src,prevSrc,dest,destIndex,length);
                }

                destIndex+=length;
                reorderStartIndex=destIndex;
            }

            /* end of source reached? */
            if(srcIndex==srcLimit) {
                break;
            }

            /* c already contains *src and norm32 is set for it, increment src*/
            ++srcIndex;

            /* check one above-minimum, relevant code unit */
            /*
             * generally, set p and length to the decomposition string
             * in simple cases, p==NULL and (c, c2) will hold the length code
             * units to append in all cases, set cc to the lead and trailCC to
             * the trail combining class
             *
             * the following merge-sort of the current character into the
             * preceding, canonically ordered result text will use the
             * optimized insertOrdered()
             * if there is only one single code point to process;
             * this is indicated with p==NULL, and (c, c2) is the character to
             * insert
             * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
             * for a supplementary character)
             * otherwise, p[length] is merged in with _mergeOrdered()
             */
            if(isNorm32HangulOrJamo(norm32)) {
                if(nx_contains(nx, c)) {
                    c2=0;
                    p=null;
                    length=1;
                } else {
                    // Hangul syllable: decompose algorithmically
                    p=buffer;
                    pStart=0;
                    cc=trailCC=0;

                    c-=HANGUL_BASE;

                    c2=(char)(c%JAMO_T_COUNT);
                    c/=JAMO_T_COUNT;
                    if(c2>0) {
                        buffer[2]=(char)(JAMO_T_BASE+c2);
                        length=3;
                    } else {
                        length=2;
                    }

                    buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT);
                    buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT);
                }
            } else {
                if(isNorm32Regular(norm32)) {
                    c2=0;
                    length=1;
                } else {
                    // c is a lead surrogate, get the real norm32
                    if(srcIndex!=srcLimit &&
                                    UTF16.isTrailSurrogate(c2=src[srcIndex])) {
                        ++srcIndex;
                        length=2;
                        norm32=getNorm32FromSurrogatePair(norm32, c2);
                    } else {
                        c2=0;
                        length=1;
                        norm32=0;
                    }
                }

                /* get the decomposition and the lead and trail cc's */
                if(nx_contains(nx, c, c2)) {
                    /* excluded: norm32==0 */
                    cc=trailCC=0;
                    p=null;
                } else if((norm32&qcMask)==0) {
                    /* c does not decompose */
                    cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT));
                    p=null;
                    pStart=-1;
                } else {
                    DecomposeArgs arg = new DecomposeArgs();
                    /* c decomposes, get everything from the variable-length
                     * extra data
                     */
                    pStart=decompose(norm32, qcMask, arg);
                    p=extraData;
                    length=arg.length;
                    cc=arg.cc;
                    trailCC=arg.trailCC;
                    if(length==1) {
                        /* fastpath a single code unit from decomposition */
                        c=p[pStart];
                        c2=0;
                        p=null;
                        pStart=-1;
                    }
                }
            }

            /* append the decomposition to the destination buffer, assume
             * length>0
             */
            if((destIndex+length)<=destLimit) {
                int reorderSplit=destIndex;
                if(p==null) {
                    /* fastpath: single code point */
                    if(cc!=0 && cc<prevCC) {
                        /* (c, c2) is out of order with respect to the preceding
                         *  text
                         */
                        destIndex+=length;
                        trailCC=insertOrdered(dest,reorderStartIndex,
                                            reorderSplit, destIndex, c, c2, cc);
                    } else {
                        /* just append (c, c2) */
                        dest[destIndex++]=c;
                        if(c2!=0) {
                            dest[destIndex++]=c2;
                        }
                    }
                } else {
                    /* general: multiple code points (ordered by themselves)
                     * from decomposition
                     */
                    if(cc!=0 && cc<prevCC) {
                        /* the decomposition is out of order with respect to the
                         *  preceding text
                         */
                        destIndex+=length;
                        trailCC=mergeOrdered(dest,reorderStartIndex,
                                          reorderSplit,p, pStart,pStart+length);
                    } else {
                        /* just append the decomposition */
                        do {
                            dest[destIndex++]=p[pStart++];
                        } while(--length>0);
                    }
                }
            } else {
                /* buffer overflow */
                /* keep incrementing the destIndex for preflighting */
                destIndex+=length;
            }

            prevCC=trailCC;
            if(prevCC==0) {
                reorderStartIndex=destIndex;
            }
        }

        outTrailCC[0]=prevCC;

        return destIndex - destStart;
    }

    /* make NFC & NFKC ------------------------------------------------------ */
    private static final class NextCombiningArgs{
        char[] source;
        int start;
        //int limit;
        char c;
        char c2;
        int/*unsigned*/ combiningIndex;
        char /*unsigned byte*/ cc;
    }

    /* get the composition properties of the next character */
    private static int /*unsigned*/    getNextCombining(NextCombiningArgs args,
                                                    int limit,
                                                    UnicodeSet nx) {
        long/*unsigned*/ norm32;
        int combineFlags;
        /* get properties */
        args.c=args.source[args.start++];
        norm32=getNorm32(args.c);

        /* preset output values for most characters */
        args.c2=0;
        args.combiningIndex=0;
        args.cc=0;

        if((norm32&(CC_MASK|COMBINES_ANY))==0) {
            return 0;
        } else {
            if(isNorm32Regular(norm32)) {
                /* set cc etc. below */
            } else if(isNorm32HangulOrJamo(norm32)) {
                /* a compatibility decomposition contained Jamos */
                args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0|
                                                        (norm32>>EXTRA_SHIFT)));
                return (int)(norm32&COMBINES_ANY);
            } else {
                /* c is a lead surrogate, get the real norm32 */
                if(args.start!=limit && UTF16.isTrailSurrogate(args.c2=
                                                     args.source[args.start])) {
                    ++args.start;
                    norm32=getNorm32FromSurrogatePair(norm32, args.c2);
                } else {
                    args.c2=0;
                    return 0;
                }
            }

            if(nx_contains(nx, args.c, args.c2)) {
                return 0; /* excluded: norm32==0 */
            }

            args.cc= (char)((norm32>>CC_SHIFT)&0xff);

            combineFlags=(int)(norm32&COMBINES_ANY);
            if(combineFlags!=0) {
                int index = getExtraDataIndex(norm32);
                args.combiningIndex=index>0 ? extraData[(index-1)] :0;
            }

            return combineFlags;
        }
    }

    /*
     * given a composition-result starter (c, c2) - which means its cc==0,
     * it combines forward, it has extra data, its norm32!=0,
     * it is not a Hangul or Jamo,
     * get just its combineFwdIndex
     *
     * norm32(c) is special if and only if c2!=0
     */
    private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){
        long/*unsigned*/ norm32;

        norm32=getNorm32(c);
        if(c2!=0) {
            norm32=getNorm32FromSurrogatePair(norm32, c2);
        }
        return extraData[(getExtraDataIndex(norm32)-1)];
    }

    /*
     * Find the recomposition result for
     * a forward-combining character
     * (specified with a pointer to its part of the combiningTable[])
     * and a backward-combining character
     * (specified with its combineBackIndex).
     *
     * If these two characters combine, then set (value, value2)
     * with the code unit(s) of the composition character.
     *
     * Return value:
     * 0    do not combine
     * 1    combine
     * >1   combine, and the composition is a forward-combining starter
     *
     * See unormimp.h for a description of the composition table format.
     */
    private static int/*unsigned*/ combine(char[]table,int tableStart,
                                   int/*unsinged*/ combineBackIndex,
                                    int[] outValues) {
        int/*unsigned*/ key;
        int value,value2;

        if(outValues.length<2){
            throw new IllegalArgumentException();
        }

        /* search in the starter's composition table */
        for(;;) {
            key=table[tableStart++];
            if(key>=combineBackIndex) {
                break;
            }
            tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1;
        }

        /* mask off bit 15, the last-entry-in-the-list flag */
        if((key&0x7fff)==combineBackIndex) {
            /* found! combine! */
            value=table[tableStart];

            /* is the composition a starter that combines forward? */
            key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1));

            /* get the composition result code point from the variable-length
             * result value
             */
            if((value&0x8000) != 0) {
                if((value&0x4000) != 0) {
                    /* surrogate pair composition result */
                    value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800));
                    value2=table[tableStart+1];
                } else {
                    /* BMP composition result U+2000..U+ffff */
                    value=table[tableStart+1];
                    value2=0;
                }
            } else {
                /* BMP composition result U+0000..U+1fff */
                value&=0x1fff;
                value2=0;
            }
            outValues[0]=value;
            outValues[1]=value2;
            return key;
        } else {
            /* not found */
            return 0;
        }
    }


    private static final class RecomposeArgs{
        char[] source;
        int start;
        int limit;
    }
    /*
     * recompose the characters in [p..limit[
     * (which is in NFD - decomposed and canonically ordered),
     * adjust limit, and return the trailing cc
     *
     * since for NFKC we may get Jamos in decompositions, we need to
     * recompose those too
     *
     * note that recomposition never lengthens the text:
     * any character consists of either one or two code units;
     * a composition may contain at most one more code unit than the original
     * starter, while the combining mark that is removed has at least one code
     * unit
     */
    private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) {
        int  remove, q, r;
        int /*unsigned*/ combineFlags;
        int /*unsigned*/ combineFwdIndex, combineBackIndex;
        int /*unsigned*/ result, value=0, value2=0;
        int /*unsigned byte*/  prevCC;
        boolean starterIsSupplementary;
        int starter;
        int[] outValues = new int[2];
        starter=-1;                   /* no starter */
        combineFwdIndex=0;            /* will not be used until starter!=NULL */
        starterIsSupplementary=false; /* will not be used until starter!=NULL */
        prevCC=0;

        NextCombiningArgs ncArg = new NextCombiningArgs();
        ncArg.source  = args.source;

        ncArg.cc      =0;
        ncArg.c2      =0;

        for(;;) {
            ncArg.start = args.start;
            combineFlags=getNextCombining(ncArg,args.limit,nx);
            combineBackIndex=ncArg.combiningIndex;
            args.start = ncArg.start;

            if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
                if((combineBackIndex&0x8000)!=0) {
                    /* c is a Jamo V/T, see if we can compose it with the
                     * previous character
                     */
                    /* for the PRI #29 fix, check that there is no intervening combining mark */
                    if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
                        remove=-1; /* NULL while no Hangul composition */
                        combineFlags=0;
                        ncArg.c2=args.source[starter];
                        if(combineBackIndex==0xfff2) {
                            /* Jamo V, compose with previous Jamo L and following
                             * Jamo T
                             */
                            ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
                            if(ncArg.c2<JAMO_L_COUNT) {
                                remove=args.start-1;
                                ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
                                               (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
                                if(args.start!=args.limit &&
                                            (ncArg.c2=(char)(args.source[args.start]
                                             -JAMO_T_BASE))<JAMO_T_COUNT) {
                                    ++args.start;
                                    ncArg.c+=ncArg.c2;
                                 } else {
                                     /* the result is an LV syllable, which is a starter (unlike LVT) */
                                     combineFlags=COMBINES_FWD;
                                }
                                if(!nx_contains(nx, ncArg.c)) {
                                    args.source[starter]=ncArg.c;
                                   } else {
                                    /* excluded */
                                    if(!isHangulWithoutJamoT(ncArg.c)) {
                                        --args.start; /* undo the ++args.start from reading the Jamo T */
                                    }
                                    /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
                                    remove=args.start;
                                }
                            }

                        /*
                         * Normally, the following can not occur:
                         * Since the input is in NFD, there are no Hangul LV syllables that
                         * a Jamo T could combine with.
                         * All Jamo Ts are combined above when handling Jamo Vs.
                         *
                         * However, before the PRI #29 fix, this can occur due to
                         * an intervening combining mark between the Hangul LV and the Jamo T.
                         */
                        } else {
                            /* Jamo T, compose with previous Hangul that does not have a Jamo T */
                            if(isHangulWithoutJamoT(ncArg.c2)) {
                                ncArg.c2+=ncArg.c-JAMO_T_BASE;
                                if(!nx_contains(nx, ncArg.c2)) {
                                    remove=args.start-1;
                                    args.source[starter]=ncArg.c2;
                                }
                            }
                        }

                        if(remove!=-1) {
                            /* remove the Jamo(s) */
                            q=remove;
                            r=args.start;
                            while(r<args.limit) {
                                args.source[q++]=args.source[r++];
                            }
                            args.start=remove;
                            args.limit=q;
                        }

                        ncArg.c2=0; /* c2 held *starter temporarily */

                        if(combineFlags!=0) {
                            /*
                             * not starter=NULL because the composition is a Hangul LV syllable
                             * and might combine once more (but only before the PRI #29 fix)
                             */

                            /* done? */
                            if(args.start==args.limit) {
                                return (char)prevCC;
                            }

                            /* the composition is a Hangul LV syllable which is a starter that combines forward */
                            combineFwdIndex=0xfff0;

                            /* we combined; continue with looking for compositions */
                            continue;
                        }
                    }

                    /*
                     * now: cc==0 and the combining index does not include
                     * "forward" -> the rest of the loop body will reset starter
                     * to NULL; technically, a composed Hangul syllable is a
                     * starter, but it does not combine forward now that we have
                     * consumed all eligible Jamos; for Jamo V/T, combineFlags
                     * does not contain _NORM_COMBINES_FWD
                     */

                } else if(
                    /* the starter is not a Hangul LV or Jamo V/T and */
                    !((combineFwdIndex&0x8000)!=0) &&
                    /* the combining mark is not blocked and */
                    ((options&BEFORE_PRI_29)!=0 ?
                        (prevCC!=ncArg.cc || prevCC==0) :
                        (prevCC<ncArg.cc || prevCC==0)) &&
                    /* the starter and the combining mark (c, c2) do combine */
                    0!=(result=combine(combiningTable,combineFwdIndex,
                                       combineBackIndex, outValues)) &&
                    /* the composition result is not excluded */
                    !nx_contains(nx, (char)value, (char)value2)
                ) {
                    value=outValues[0];
                    value2=outValues[1];
                    /* replace the starter with the composition, remove the
                     * combining mark
                     */
                    remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */

                    /* replace the starter with the composition */
                    args.source[starter]=(char)value;
                    if(starterIsSupplementary) {
                        if(value2!=0) {
                            /* both are supplementary */
                            args.source[starter+1]=(char)value2;
                        } else {
                            /* the composition is shorter than the starter,
                             * move the intermediate characters forward one */
                            starterIsSupplementary=false;
                            q=starter+1;
                            r=q+1;
                            while(r<remove) {
                                args.source[q++]=args.source[r++];
                            }
                            --remove;
                        }
                    } else if(value2!=0) {
                        /* the composition is longer than the starter,
                         * move the intermediate characters back one */
                        starterIsSupplementary=true;
                        /* temporarily increment for the loop boundary */
                        ++starter;
                        q=remove;
                        r=++remove;
                        while(starter<q) {
                            args.source[--r]=args.source[--q];
                        }
                        args.source[starter]=(char)value2;
                        --starter; /* undo the temporary increment */
                    /* } else { both are on the BMP, nothing more to do */
                    }

                    /* remove the combining mark by moving the following text
                     * over it */
                    if(remove<args.start) {
                        q=remove;
                        r=args.start;
                        while(r<args.limit) {
                            args.source[q++]=args.source[r++];
                        }
                        args.start=remove;
                        args.limit=q;
                    }

                    /* keep prevCC because we removed the combining mark */

                    /* done? */
                    if(args.start==args.limit) {
                        return (char)prevCC;
                    }

                    /* is the composition a starter that combines forward? */
                    if(result>1) {
                       combineFwdIndex=getCombiningIndexFromStarter((char)value,
                                                                  (char)value2);
                    } else {
                       starter=-1;
                    }

                    /* we combined; continue with looking for compositions */
                    continue;
                }
            }

            /* no combination this time */
            prevCC=ncArg.cc;
            if(args.start==args.limit) {
                return (char)prevCC;
            }

            /* if (c, c2) did not combine, then check if it is a starter */
            if(ncArg.cc==0) {
                /* found a new starter; combineFlags==0 if (c, c2) is excluded */
                if((combineFlags&COMBINES_FWD)!=0) {
                    /* it may combine with something, prepare for it */
                    if(ncArg.c2==0) {
                        starterIsSupplementary=false;
                        starter=args.start-1;
                    } else {
                        starterIsSupplementary=false;
                        starter=args.start-2;
                    }
                    combineFwdIndex=combineBackIndex;
                } else {
                    /* it will not combine with anything */
                    starter=-1;
                }
            } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) {
                /* FCC: no discontiguous compositions; any intervening character blocks */
                starter=-1;
            }
        }
    }

    // find the last true starter between src[start]....src[current] going
    // backwards and return its index
    private static int findPreviousStarter(char[]src, int srcStart, int current,
                                          int/*unsigned*/ ccOrQCMask,
                                          int/*unsigned*/ decompQCMask,
                                          char minNoMaybe) {
       long norm32;
       PrevArgs args = new PrevArgs();
       args.src = src;
       args.start = srcStart;
       args.current = current;

       while(args.start<args.current) {
           norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask);
           if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
               break;
           }
       }
       return args.current;
    }

    /* find the first true starter in [src..limit[ and return the
     * pointer to it
     */
    private static int/*index*/    findNextStarter(char[] src,int start,int limit,
                                                 int/*unsigned*/ qcMask,
                                                 int/*unsigned*/ decompQCMask,
                                                 char minNoMaybe) {
        int p;
        long/*unsigned*/ norm32;
        int ccOrQCMask;
        char c, c2;

        ccOrQCMask=CC_MASK|qcMask;

        DecomposeArgs decompArgs = new DecomposeArgs();

        for(;;) {
            if(start==limit) {
                break; /* end of string */
            }
            c=src[start];
            if(c<minNoMaybe) {
                break; /* catches NUL terminater, too */
            }

            norm32=getNorm32(c);
            if((norm32&ccOrQCMask)==0) {
                break; /* true starter */
            }

            if(isNorm32LeadSurrogate(norm32)) {
                /* c is a lead surrogate, get the real norm32 */
                if((start+1)==limit ||
                                   !UTF16.isTrailSurrogate(c2=(src[start+1]))){
                    /* unmatched first surrogate: counts as a true starter */
                    break;
                }
                norm32=getNorm32FromSurrogatePair(norm32, c2);

                if((norm32&ccOrQCMask)==0) {
                    break; /* true starter */
                }
            } else {
                c2=0;
            }

            /* (c, c2) is not a true starter but its decomposition may be */
            if((norm32&decompQCMask)!=0) {
                /* (c, c2) decomposes, get everything from the variable-length
                 *  extra data */
                p=decompose(norm32, decompQCMask, decompArgs);

                /* get the first character's norm32 to check if it is a true
                 * starter */
                if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) {
                    break; /* true starter */
                }
            }

            start+= c2==0 ? 1 : 2; /* not a true starter, continue */
        }

        return start;
    }


    private static final class ComposePartArgs{
        int prevCC;
        int length;   /* length of decomposed part */
    }

     /* decompose and recompose [prevStarter..src[ */
    private static char[] composePart(ComposePartArgs args,
                                      int prevStarter,
                                         char[] src, int start, int limit,
                                       int options,
                                       UnicodeSet nx) {
        int recomposeLimit;
        boolean compat =((options&OPTIONS_COMPAT)!=0);

        /* decompose [prevStarter..src[ */
        int[] outTrailCC = new int[1];
        char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE];

        for(;;){
            args.length=decompose(src,prevStarter,(start),
                                      buffer,0,buffer.length,
                                      compat,outTrailCC,nx);
            if(args.length<=buffer.length){
                break;
            }else{
                buffer = new char[args.length];
            }
        }

        /* recompose the decomposition */
        recomposeLimit=args.length;

        if(args.length>=2) {
            RecomposeArgs rcArgs = new RecomposeArgs();
            rcArgs.source    = buffer;
            rcArgs.start    = 0;
            rcArgs.limit    = recomposeLimit;
            args.prevCC=recompose(rcArgs, options, nx);
            recomposeLimit = rcArgs.limit;
        }

        /* return with a pointer to the recomposition and its length */
        args.length=recomposeLimit;
        return buffer;
    }

    private static boolean composeHangul(char prev, char c,
                                         long/*unsigned*/ norm32,
                                         char[] src,int[] srcIndex, int limit,
                                            boolean compat,
                                         char[] dest,int destIndex,
                                         UnicodeSet nx) {
        int start=srcIndex[0];
        if(isJamoVTNorm32JamoV(norm32)) {
            /* c is a Jamo V, compose with previous Jamo L and
             * following Jamo T */
            prev=(char)(prev-JAMO_L_BASE);
            if(prev<JAMO_L_COUNT) {
                c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+
                                                 (c-JAMO_V_BASE))*JAMO_T_COUNT);

                /* check if the next character is a Jamo T (normal or
                 * compatibility) */
                if(start!=limit) {
                    char next, t;

                    next=src[start];
                    if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
                        /* normal Jamo T */
                        ++start;
                        c+=t;
                    } else if(compat) {
                        /* if NFKC, then check for compatibility Jamo T
                         * (BMP only) */
                        norm32=getNorm32(next);
                        if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) {
                            int p /*index into extra data array*/;
                            DecomposeArgs dcArgs = new DecomposeArgs();
                            p=decompose(norm32, QC_NFKD, dcArgs);
                            if(dcArgs.length==1 &&
                                   (t=(char)(extraData[p]-JAMO_T_BASE))
                                                   <JAMO_T_COUNT) {
                                /* compatibility Jamo T */
                                ++start;
                                c+=t;
                            }
                        }
                    }
                }
                if(nx_contains(nx, c)) {
                    if(!isHangulWithoutJamoT(c)) {
                        --start; /* undo ++start from reading the Jamo T */
                    }
                    return false;
                }
                dest[destIndex]=c;
                srcIndex[0]=start;
                return true;
            }
        } else if(isHangulWithoutJamoT(prev)) {
            /* c is a Jamo T, compose with previous Hangul LV that does not
             * contain a Jamo T */
            c=(char)(prev+(c-JAMO_T_BASE));
            if(nx_contains(nx, c)) {
                return false;
            }
            dest[destIndex]=c;
            srcIndex[0]=start;
            return true;
        }
        return false;
    }
    /*
    public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){
        return compose(src,0,src.length,dest,0,dest.length,compat, nx);
    }
    */

    public static int compose(char[] src, int srcStart, int srcLimit,
                              char[] dest,int destStart,int destLimit,
                              int options,UnicodeSet nx) {

        int prevSrc, prevStarter;
        long/*unsigned*/ norm32;
        int ccOrQCMask, qcMask;
        int  reorderStartIndex, length;
        char c, c2, minNoMaybe;
        int/*unsigned byte*/ cc, prevCC;
        int[] ioIndex = new int[1];
        int destIndex = destStart;
        int srcIndex = srcStart;

        if((options&OPTIONS_COMPAT)!=0) {
            minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE];
            qcMask=QC_NFKC;
        } else {
            minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE];
            qcMask=QC_NFC;
        }

        /*
         * prevStarter points to the last character before the current one
         * that is a "true" starter with cc==0 and quick check "yes".
         *
         * prevStarter will be used instead of looking for a true starter
         * while incrementally decomposing [prevStarter..prevSrc[
         * in _composePart(). Having a good prevStarter allows to just decompose
         * the entire [prevStarter..prevSrc[.
         *
         * When _composePart() backs out from prevSrc back to prevStarter,
         * then it also backs out destIndex by the same amount.
         * Therefore, at all times, the (prevSrc-prevStarter) source units
         * must correspond 1:1 to destination units counted with destIndex,
         * except for reordering.
         * This is true for the qc "yes" characters copied in the fast loop,
         * and for pure reordering.
         * prevStarter must be set forward to src when this is not true:
         * In _composePart() and after composing a Hangul syllable.
         *
         * This mechanism relies on the assumption that the decomposition of a
         * true starter also begins with a true starter. gennorm/store.c checks
         * for this.
         */
        prevStarter=srcIndex;

        ccOrQCMask=CC_MASK|qcMask;
        /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/
        prevCC=0;

        /* avoid compiler warnings */
        norm32=0;
        c=0;

        for(;;) {
            /* count code units below the minimum or with irrelevant data for
             * the quick check */
            prevSrc=srcIndex;

            while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe ||
                     ((norm32=getNorm32(c))&ccOrQCMask)==0)) {
                prevCC=0;
                ++srcIndex;
            }


            /* copy these code units all at once */
            if(srcIndex!=prevSrc) {
                length=(int)(srcIndex-prevSrc);
                if((destIndex+length)<=destLimit) {
                    System.arraycopy(src,prevSrc,dest,destIndex,length);
                }
                destIndex+=length;
                reorderStartIndex=destIndex;

                /* set prevStarter to the last character in the quick check
                 * loop */
                prevStarter=srcIndex-1;
                if(UTF16.isTrailSurrogate(src[prevStarter]) &&
                    prevSrc<prevStarter &&
                    UTF16.isLeadSurrogate(src[(prevStarter-1)])) {
                    --prevStarter;
                }

                prevSrc=srcIndex;
            }

            /* end of source reached? */
            if(srcIndex==srcLimit) {
                break;
            }

            /* c already contains *src and norm32 is set for it, increment src*/
            ++srcIndex;

            /*
             * source buffer pointers:
             *
             *  all done      quick check   current char  not yet
             *                "yes" but     (c, c2)       processed
             *                may combine
             *                forward
             * [-------------[-------------[-------------[-------------[
             * |             |             |             |             |
             * start         prevStarter   prevSrc       src           limit
             *
             *
             * destination buffer pointers and indexes:
             *
             *  all done      might take    not filled yet
             *                characters for
             *                reordering
             * [-------------[-------------[-------------[
             * |             |             |             |
             * dest      reorderStartIndex destIndex     destCapacity
             */

            /* check one above-minimum, relevant code unit */
            /*
             * norm32 is for c=*(src-1), and the quick check flag is "no" or
             * "maybe", and/or cc!=0
             * check for Jamo V/T, then for surrogates and regular characters
             * c is not a Hangul syllable or Jamo L because
             * they are not marked with no/maybe for NFC & NFKC(and their cc==0)
             */
            if(isNorm32HangulOrJamo(norm32)) {
                /*
                 * c is a Jamo V/T:
                 * try to compose with the previous character, Jamo V also with
                 * a following Jamo T, and set values here right now in case we
                 * just continue with the main loop
                 */
                prevCC=cc=0;
                reorderStartIndex=destIndex;
                ioIndex[0]=srcIndex;
                if(
                    destIndex>0 &&
                    composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex,
                                  srcLimit, (options&OPTIONS_COMPAT)!=0, dest,
                                  destIndex<=destLimit ? destIndex-1: 0,
                                  nx)
                ) {
                    srcIndex=ioIndex[0];
                    prevStarter=srcIndex;
                    continue;
                }

                srcIndex = ioIndex[0];

                /* the Jamo V/T did not compose into a Hangul syllable, just
                 * append to dest */
                c2=0;
                length=1;
                prevStarter=prevSrc;
            } else {
                if(isNorm32Regular(norm32)) {
                    c2=0;
                    length=1;
                } else {
                    /* c is a lead surrogate, get the real norm32 */
                    if(srcIndex!=srcLimit &&
                                     UTF16.isTrailSurrogate(c2=src[srcIndex])) {
                        ++srcIndex;
                        length=2;
                        norm32=getNorm32FromSurrogatePair(norm32, c2);
                    } else {
                        /* c is an unpaired lead surrogate, nothing to do */
                        c2=0;
                        length=1;
                        norm32=0;
                    }
                }
                ComposePartArgs args =new ComposePartArgs();

                /* we are looking at the character (c, c2) at [prevSrc..src[ */
                if(nx_contains(nx, c, c2)) {
                    /* excluded: norm32==0 */
                    cc=0;
                } else if((norm32&qcMask)==0) {
                    cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT));
                } else {
                    char[] p;

                    /*
                     * find appropriate boundaries around this character,
                     * decompose the source text from between the boundaries,
                     * and recompose it
                     *
                     * this puts the intermediate text into the side buffer because
                     * it might be longer than the recomposition end result,
                     * or the destination buffer may be too short or missing
                     *
                     * note that destIndex may be adjusted backwards to account
                     * for source text that passed the quick check but needed to
                     * take part in the recomposition
                     */
                    int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
                    /*
                     * find the last true starter in [prevStarter..src[
                     * it is either the decomposition of the current character (at prevSrc),
                     * or prevStarter
                     */
                    if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) {
                        prevStarter=prevSrc;
                    } else {
                        /* adjust destIndex: back out what had been copied with qc "yes" */
                        destIndex-=prevSrc-prevStarter;
                    }

                    /* find the next true starter in [src..limit[ */
                    srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask,
                                               decompQCMask, minNoMaybe);
                    //args.prevStarter = prevStarter;
                    args.prevCC    = prevCC;
                    //args.destIndex = destIndex;
                    args.length = length;
                    p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx);

                    if(p==null) {
                        /* an error occurred (out of memory) */
                        break;
                    }

                    prevCC      = args.prevCC;
                    length      = args.length;

                    /* append the recomposed buffer contents to the destination
                     * buffer */
                    if((destIndex+args.length)<=destLimit) {
                        int i=0;
                        while(i<args.length) {
                            dest[destIndex++]=p[i++];
                            --length;
                        }
                    } else {
                        /* buffer overflow */
                        /* keep incrementing the destIndex for preflighting */
                        destIndex+=length;
                    }

                    prevStarter=srcIndex;
                    continue;
                }
            }

            /* append the single code point (c, c2) to the destination buffer */
            if((destIndex+length)<=destLimit) {
                if(cc!=0 && cc<prevCC) {
                    /* (c, c2) is out of order with respect to the preceding
                     * text */
                    int reorderSplit= destIndex;
                    destIndex+=length;
                    prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit,
                                         destIndex, c, c2, cc);
                } else {
                    /* just append (c, c2) */
                    dest[destIndex++]=c;
                    if(c2!=0) {
                        dest[destIndex++]=c2;
                    }
                    prevCC=cc;
                }
            } else {
                /* buffer overflow */
                /* keep incrementing the destIndex for preflighting */
                destIndex+=length;
                prevCC=cc;
            }
        }

        return destIndex - destStart;
    }

    public static int getCombiningClass(int c) {
        long norm32;
        norm32=getNorm32(c);
        return (char)((norm32>>CC_SHIFT)&0xFF);
    }

    public static boolean isFullCompositionExclusion(int c) {
        if(isFormatVersion_2_1) {
            int aux =AuxTrieImpl.auxTrie.getCodePointValue(c);
            return (boolean)((aux & AUX_COMP_EX_MASK)!=0);
        } else {
            return false;
        }
    }

    public static boolean isCanonSafeStart(int c) {
        if(isFormatVersion_2_1) {
            int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
            return (boolean)((aux & AUX_UNSAFE_MASK)==0);
        } else {
            return false;
        }
    }

    /* Is c an NF<mode>-skippable code point? See unormimp.h. */
    public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) {
        long /*unsigned int*/ norm32;
        mask = mask & UNSIGNED_INT_MASK;
        char aux;

        /* check conditions (a)..(e), see unormimp.h */
        norm32 = getNorm32(c);

        if((norm32&mask)!=0) {
            return false; /* fails (a)..(e), not skippable */
        }

        if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){
            return true; /* NF*D, passed (a)..(c), is skippable */
        }
        /* check conditions (a)..(e), see unormimp.h */

        /* NF*C/FCC, passed (a)..(e) */
        if((norm32& QC_NFD)==0) {
            return true; /* no canonical decomposition, is skippable */
        }

        /* check Hangul syllables algorithmically */
        if(isNorm32HangulOrJamo(norm32)) {
            /* Jamo passed (a)..(e) above, must be Hangul */
            return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */
        }

        /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
        /* NF*C, test (f) flag */
        if(!isFormatVersion_2_2) {
            return false; /* no (f) data, say not skippable to be safe */
        }


        aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
        return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */

        /* } else { FCC, test fcd<=1 instead of the above } */
    }

    public static UnicodeSet addPropertyStarts(UnicodeSet set) {
        int c;

        /* add the start code point of each same-value range of each trie */
        //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
        TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie);
        RangeValueIterator.Element normResult = new RangeValueIterator.Element();

        while(normIter.next(normResult)){
            set.add(normResult.start);
        }

        //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
        TrieIterator fcdIter  = new TrieIterator(FCDTrieImpl.fcdTrie);
        RangeValueIterator.Element fcdResult = new RangeValueIterator.Element();

        while(fcdIter.next(fcdResult)){
            set.add(fcdResult.start);
        }

        if(isFormatVersion_2_1){
            //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
            TrieIterator auxIter  = new TrieIterator(AuxTrieImpl.auxTrie);
            RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
            while(auxIter.next(auxResult)){
                set.add(auxResult.start);
            }
        }
        /* add Hangul LV syllables and LV+1 because of skippables */
        for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
            set.add(c);
            set.add(c+1);
        }
        set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
        return set; // for chaining
    }

    /**
     * Internal API, used in UCharacter.getIntPropertyValue().
     * @internal
     * @param c code point
     * @param modeValue numeric value compatible with Mode
     * @return numeric value compatible with QuickCheck
     */
    public static final int quickCheck(int c, int modeValue) {
        final int qcMask[/*UNORM_MODE_COUNT*/]={
            0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC
        };

        int norm32=(int)getNorm32(c)&qcMask[modeValue];

        if(norm32==0) {
            return 1; // YES
        } else if((norm32&QC_ANY_NO)!=0) {
            return 0; // NO
        } else /* _NORM_QC_ANY_MAYBE */ {
            return 2; // MAYBE;
        }
    }

    private static int strCompare(char[] s1, int s1Start, int s1Limit,
                                  char[] s2, int s2Start, int s2Limit,
                                  boolean codePointOrder) {

        int start1, start2, limit1, limit2;

        char c1, c2;

        /* setup for fix-up */
        start1=s1Start;
        start2=s2Start;

        int length1, length2;

        length1 = s1Limit - s1Start;
        length2 = s2Limit - s2Start;

        int lengthResult;

        if(length1<length2) {
            lengthResult=-1;
            limit1=start1+length1;
        } else if(length1==length2) {
            lengthResult=0;
            limit1=start1+length1;
        } else /* length1>length2 */ {
            lengthResult=1;
            limit1=start1+length2;
        }

        if(s1==s2) {
            return lengthResult;
        }

        for(;;) {
            /* check pseudo-limit */
            if(s1Start==limit1) {
                return lengthResult;
            }

            c1=s1[s1Start];
            c2=s2[s2Start];
            if(c1!=c2) {
                break;
            }
            ++s1Start;
            ++s2Start;
        }

        /* setup for fix-up */
        limit1=start1+length1;
        limit2=start2+length2;


        /* if both values are in or above the surrogate range, fix them up */
        if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
            /* subtract 0x2800 from BMP code points to make them smaller than
             *  supplementary ones */
            if(
                ( c1<=0xdbff && (s1Start+1)!=limit1 &&
                  UTF16.isTrailSurrogate(s1[(s1Start+1)])
                ) ||
                ( UTF16.isTrailSurrogate(c1) && start1!=s1Start &&
                  UTF16.isLeadSurrogate(s1[(s1Start-1)])
                )
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                ( c2<=0xdbff && (s2Start+1)!=limit2 &&
                  UTF16.isTrailSurrogate(s2[(s2Start+1)])
                ) ||
                ( UTF16.isTrailSurrogate(c2) && start2!=s2Start &&
                  UTF16.isLeadSurrogate(s2[(s2Start-1)])
                )
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        /* now c1 and c2 are in UTF-32-compatible order */
        return (int)c1-(int)c2;
    }


    /*
     * Status of tailored normalization
     *
     * This was done initially for investigation on Unicode public review issue 7
     * (http://www.unicode.org/review/). See Jitterbug 2481.
     * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
     * a permanent feature in ICU 2.6 in support of IDNA which requires true
     * Unicode 3.2 normalization.
     * (NormalizationCorrections are rolled into IDNA mapping tables.)
     *
     * Tailored normalization as implemented here allows to "normalize less"
     * than full Unicode normalization would.
     * Based internally on a UnicodeSet of code points that are
     * "excluded from normalization", the normalization functions leave those
     * code points alone ("inert"). This means that tailored normalization
     * still transforms text into a canonically equivalent form.
     * It does not add decompositions to code points that do not have any or
     * change decomposition results.
     *
     * Any function that searches for a safe boundary has not been touched,
     * which means that these functions will be over-pessimistic when
     * exclusions are applied.
     * This should not matter because subsequent checks and normalizations
     * do apply the exclusions; only a little more of the text may be processed
     * than necessary under exclusions.
     *
     * Normalization exclusions have the following effect on excluded code points c:
     * - c is not decomposed
     * - c is not a composition target
     * - c does not combine forward or backward for composition
     *   except that this is not implemented for Jamo
     * - c is treated as having a combining class of 0
     */

    /*
     * Constants for the bit fields in the options bit set parameter.
     * These need not be public.
     * A user only needs to know the currently assigned values.
     * The number and positions of reserved bits per field can remain private.
     */
    private static final int OPTIONS_NX_MASK=0x1f;
    private static final int OPTIONS_UNICODE_MASK=0xe0;
    public  static final int OPTIONS_SETS_MASK=0xff;
    private static final int OPTIONS_UNICODE_SHIFT=5;
    private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];

    /* Constants for options flags for normalization.*/

    /**
     * Options bit 0, do not decompose Hangul syllables.
     * @draft ICU 2.6
     */
    private static final int NX_HANGUL = 1;
    /**
     * Options bit 1, do not decompose CJK compatibility characters.
     * @draft ICU 2.6
     */
    private static final int NX_CJK_COMPAT=2;
    /**
     * Options bit 8, use buggy recomposition described in
     * Unicode Public Review Issue #29
     * at http://www.unicode.org/review/resolved-pri.html#pri29
     *
     * Used in IDNA implementation according to strict interpretation
     * of IDNA definition based on Unicode 3.2 which predates PRI #29.
     *
     * See ICU4C unormimp.h
     *
     * @draft ICU 3.2
     */
    public static final int BEFORE_PRI_29=0x100;

    /*
     * The following options are used only in some composition functions.
     * They use bits 12 and up to preserve lower bits for the available options
     * space in unorm_compare() -
     * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
     */

    /** Options bit 12, for compatibility vs. canonical decomposition. */
    public static final int OPTIONS_COMPAT=0x1000;
    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000;

    /* normalization exclusion sets --------------------------------------------- */

    /*
     * Normalization exclusion UnicodeSets are used for tailored normalization;
     * see the comment near the beginning of this file.
     *
     * By specifying one or several sets of code points,
     * those code points become inert for normalization.
     */
    private static final synchronized UnicodeSet internalGetNXHangul() {
        /* internal function, does not check for incoming U_FAILURE */

        if(nxCache[NX_HANGUL]==null) {
             nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3);
        }
        return nxCache[NX_HANGUL];
    }

    private static final synchronized UnicodeSet internalGetNXCJKCompat() {
        /* internal function, does not check for incoming U_FAILURE */

        if(nxCache[NX_CJK_COMPAT]==null) {

            /* build a set from [CJK Ideographs]&[has canonical decomposition] */
            UnicodeSet set, hasDecomp;

            set=new UnicodeSet("[:Ideographic:]");

            /* start with an empty set for [has canonical decomposition] */
            hasDecomp=new UnicodeSet();

            /* iterate over all ideographs and remember which canonically decompose */
            UnicodeSetIterator it = new UnicodeSetIterator(set);
            int start, end;
            long norm32;

            while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) {
                start=it.codepoint;
                end=it.codepointEnd;
                while(start<=end) {
                    norm32 = getNorm32(start);
                    if((norm32 & QC_NFD)>0) {
                        hasDecomp.add(start);
                    }
                    ++start;
                }
            }

            /* hasDecomp now contains all ideographs that decompose canonically */
             nxCache[NX_CJK_COMPAT]=hasDecomp;

        }

        return nxCache[NX_CJK_COMPAT];
    }

    private static final synchronized UnicodeSet internalGetNXUnicode(int options) {
        options &= OPTIONS_UNICODE_MASK;
        if(options==0) {
            return null;
        }

        if(nxCache[options]==null) {
            /* build a set with all code points that were not designated by the specified Unicode version */
            UnicodeSet set = new UnicodeSet();

            switch(options) {
            case NormalizerBase.UNICODE_3_2:
                set.applyPattern("[:^Age=3.2:]");
                break;
            default:
                return null;
            }

            nxCache[options]=set;
        }

        return nxCache[options];
    }

    /* Get a decomposition exclusion set. The data must be loaded. */
    private static final synchronized UnicodeSet internalGetNX(int options) {
        options&=OPTIONS_SETS_MASK;

        if(nxCache[options]==null) {
            /* return basic sets */
            if(options==NX_HANGUL) {
                return internalGetNXHangul();
            }
            if(options==NX_CJK_COMPAT) {
                return internalGetNXCJKCompat();
            }
            if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) {
                return internalGetNXUnicode(options);
            }

            /* build a set from multiple subsets */
            UnicodeSet set;
            UnicodeSet other;

            set=new UnicodeSet();


            if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) {
                set.addAll(other);
            }
            if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) {
                set.addAll(other);
            }
            if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) {
                set.addAll(other);
            }

               nxCache[options]=set;
        }
        return nxCache[options];
    }

    public static final UnicodeSet getNX(int options) {
        if((options&=OPTIONS_SETS_MASK)==0) {
            /* incoming failure, or no decomposition exclusions requested */
            return null;
        } else {
            return internalGetNX(options);
        }
    }

    private static final boolean nx_contains(UnicodeSet nx, int c) {
        return nx!=null && nx.contains(c);
    }

    private static final boolean nx_contains(UnicodeSet nx, char c, char c2) {
        return nx!=null && nx.contains(c2==0 ? c : UCharacterProperty.getRawSupplementary(c, c2));
    }

/*****************************************************************************/

    /**
     * Get the canonical decomposition
     * sherman  for ComposedCharIter
     */

    public static int getDecompose(int chars[], String decomps[]) {
        DecomposeArgs args = new DecomposeArgs();
        int length=0;
        long norm32 = 0;
        int ch = -1;
        int index = 0;
        int i = 0;

        while (++ch < 0x2fa1e) {   //no cannoical above 0x3ffff
            //TBD !!!! the hack code heres save us about 50ms for startup
            //need a better solution/lookup
            if (ch == 0x30ff)
                ch = 0xf900;
            else if (ch == 0x10000)
                ch = 0x1d15e;
            else if (ch == 0x1d1c1)
                ch = 0x2f800;

            norm32 = NormalizerImpl.getNorm32(ch);
            if((norm32 & QC_NFD)!=0 && i < chars.length) {
                chars[i] = ch;
                index = decompose(norm32, args);
                decomps[i++] = new String(extraData,index, args.length);
            }
        }
        return i;
    }

    //------------------------------------------------------
    // special method for Collation
    //------------------------------------------------------
    private static boolean needSingleQuotation(char c) {
        return (c >= 0x0009 && c <= 0x000D) ||
               (c >= 0x0020 && c <= 0x002F) ||
               (c >= 0x003A && c <= 0x0040) ||
               (c >= 0x005B && c <= 0x0060) ||
               (c >= 0x007B && c <= 0x007E);
    }

    public static String canonicalDecomposeWithSingleQuotation(String string) {
        char[] src = string.toCharArray();
        int    srcIndex = 0;
        int    srcLimit = src.length;
        char[] dest = new char[src.length * 3];  //MAX_BUF_SIZE_DECOMPOSE = 3
        int    destIndex = 0;
        int    destLimit = dest.length;

        char[] buffer = new char[3];
        int prevSrc;
        long norm32;
        int ccOrQCMask;
        int qcMask = QC_NFD;
        int reorderStartIndex, length;
        char c, c2;
        char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE];
        int cc, prevCC, trailCC;
        char[] p;
        int pStart;


        // initialize
        ccOrQCMask = CC_MASK | qcMask;
        reorderStartIndex = 0;
        prevCC = 0;
        norm32 = 0;
        c = 0;
        pStart = 0;

        cc = trailCC = -1; // initialize to bogus value
        for(;;) {
            prevSrc=srcIndex;
            //quick check (1)less than minNoMaybe (2)no decomp (3)hangual
            while (srcIndex != srcLimit &&
                   (( c = src[srcIndex]) < minNoMaybe ||
                    ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 ||
                    ( c >= '\uac00' && c <= '\ud7a3'))){

                prevCC = 0;
                ++srcIndex;
            }

            // copy these code units all at once
            if (srcIndex != prevSrc) {
                length = (int)(srcIndex - prevSrc);
                if ((destIndex + length) <= destLimit) {
                    System.arraycopy(src,prevSrc,dest,destIndex,length);
                }

                destIndex += length;
                reorderStartIndex = destIndex;
            }

            // end of source reached?
            if(srcIndex == srcLimit) {
                break;
            }
            // c already contains *src and norm32 is set for it, increment src
            ++srcIndex;

            if(isNorm32Regular(norm32)) {
                c2 = 0;
                length = 1;
            } else {
                // c is a lead surrogate, get the real norm32
                if(srcIndex != srcLimit &&
                    Character.isLowSurrogate(c2 = src[srcIndex])) {
                        ++srcIndex;
                        length = 2;
                        norm32 = getNorm32FromSurrogatePair(norm32, c2);
                } else {
                    c2 = 0;
                    length = 1;
                    norm32 = 0;
                }
            }

            // get the decomposition and the lead and trail cc's
            if((norm32 & qcMask) == 0) {
                // c does not decompose
                cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT));
                p = null;
                pStart = -1;
            } else {
                DecomposeArgs arg = new DecomposeArgs();
                // c decomposes, get everything from the variable-length
                // extra data
                pStart = decompose(norm32, qcMask, arg);
                p = extraData;
                length = arg.length;
                cc = arg.cc;
                trailCC = arg.trailCC;
                if(length == 1) {
                    // fastpath a single code unit from decomposition
                    c = p[pStart];
                    c2 = 0;
                    p = null;
                    pStart = -1;
                }
            }

            if((destIndex + length * 3) >= destLimit) {  // 2 SingleQuotations
                // buffer overflow
                char[] tmpBuf = new char[destLimit * 2];
                System.arraycopy(dest, 0, tmpBuf, 0, destIndex);
                dest = tmpBuf;
                destLimit = dest.length;
            }
            // append the decomposition to the destination buffer, assume length>0
            {
                int reorderSplit = destIndex;
                if(p == null) {
                    // fastpath: single code point
                    if (needSingleQuotation(c)) {
                        //if we need single quotation, no need to consider "prevCC"
                        //and it must NOT be a supplementary pair
                        dest[destIndex++] = '\'';
                        dest[destIndex++] = c;
                        dest[destIndex++] = '\'';
                        trailCC = 0;
                    } else if(cc != 0 && cc < prevCC) {
                        // (c, c2) is out of order with respect to the preceding
                        //  text
                        destIndex += length;
                        trailCC = insertOrdered(dest,reorderStartIndex,
                                                reorderSplit, destIndex, c, c2, cc);
                    } else {
                        // just append (c, c2)
                        dest[destIndex++] = c;
                        if(c2 != 0) {
                            dest[destIndex++] = c2;
                        }
                    }
                } else {
                    // general: multiple code points (ordered by themselves)
                    // from decomposition
                    if (needSingleQuotation(p[pStart])) {
                        dest[destIndex++] = '\'';
                        dest[destIndex++] = p[pStart++];
                        dest[destIndex++] = '\'';
                        length--;
                        do {
                            dest[destIndex++] = p[pStart++];
                        } while(--length > 0);
                    } else
                    if(cc != 0 && cc < prevCC) {
                        destIndex += length;
                        trailCC = mergeOrdered(dest,reorderStartIndex,
                                               reorderSplit,p, pStart,pStart+length);
                    } else {
                        // just append the decomposition
                        do {
                            dest[destIndex++] = p[pStart++];
                        } while(--length > 0);
                    }
                }
            }
            prevCC = trailCC;
            if(prevCC == 0) {
                reorderStartIndex = destIndex;
            }
        }
        return new String(dest, 0, destIndex);
    }

    //------------------------------------------------------
    // mapping method for IDNA/StringPrep
    //------------------------------------------------------

    /*
     * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode
     * 3.2 normalization with Corrigendum 4 corrections. However, normalization
     * without the corrections is necessary for IDNA/StringPrep support.
     * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option
     * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five
     * characters in Corrigendum 4 before normalization in order to avoid
     * incorrect normalization.
     * For the Corrigendum 4 issue, refer
     *   http://www.unicode.org/versions/corrigendum4.html
     */

    /*
     * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL.
     */
    public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000;

    private static final char[][] corrigendum4MappingTable = {
        {'\uD844', '\uDF6A'},  // 0x2F868
        {'\u5F33'},            // 0x2F874
        {'\u43AB'},            // 0x2F91F
        {'\u7AAE'},            // 0x2F95F
        {'\u4D57'}};           // 0x2F9BF

    /*
     * Removing Corrigendum 4 fix
     * @return normalized text
     */
    public static String convert(String str) {
        if (str == null) {
            return null;
        }

        int ch  = UCharacterIterator.DONE;
        StringBuffer dest = new StringBuffer();
        UCharacterIterator iter = UCharacterIterator.getInstance(str);

        while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
            switch (ch) {
            case 0x2F868:
                dest.append(corrigendum4MappingTable[0]);
                break;
            case 0x2F874:
                dest.append(corrigendum4MappingTable[1]);
                break;
            case 0x2F91F:
                dest.append(corrigendum4MappingTable[2]);
                break;
            case 0x2F95F:
                dest.append(corrigendum4MappingTable[3]);
                break;
            case 0x2F9BF:
                dest.append(corrigendum4MappingTable[4]);
                break;
            default:
                UTF16.append(dest,ch);
                break;
            }
        }

        return dest.toString();
    }
}
Interfaces

Classes

NormalizerImpl