package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;

/**
 * Loader for text files that represent a list of stopwords.
 */
public class WordlistLoader {

  /**
   * Loads a text file and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
 * one word. The words need to be in lowercase if you use an Analyzer that
 * applies LowerCaseFilter (such as StandardAnalyzer).
   *
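   * <p>For example, assuming a plain text file named <code>stopwords.txt</code>
   * (the name is only illustrative) with one word per line:
   * <pre>
   *   HashSet&lt;String&gt; stopWords = WordlistLoader.getWordSet(new File("stopwords.txt"));
   * </pre>
   *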
   * @param wordfile File containing the wordlist
   * @return A HashSet with the file's words
   */
  public static HashSet<String> getWordSet(File wordfile) throws IOException {
    HashSet<String> result = new HashSet<String>();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }

  /**
   * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the file should contain only
   * one word. The words need to be in lowercase if you use an Analyzer that
   * applies LowerCaseFilter (such as StandardAnalyzer).
   *
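   * <p>For example, to skip lines starting with <code>#</code> in a hypothetical
   * <code>stopwords.txt</code>:
   * <pre>
   *   HashSet&lt;String&gt; stopWords = WordlistLoader.getWordSet(new File("stopwords.txt"), "#");
   * </pre>
   *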
   * @param wordfile File containing the wordlist
   * @param comment The comment string to ignore
   * @return A HashSet with the file's words
   */
  public static HashSet<String> getWordSet(File wordfile, String comment) throws IOException {
    HashSet<String> result = new HashSet<String>();
    FileReader reader = null;
    try {
      reader = new FileReader(wordfile);
      result = getWordSet(reader, comment);
    }
    finally {
      if (reader != null)
        reader.close();
    }
    return result;
  }


  /**
   * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you use an Analyzer that
   * applies LowerCaseFilter (such as StandardAnalyzer).
   *
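   * <p>For example, with a FileReader over a hypothetical <code>stopwords.txt</code>
   * (the given Reader is closed by this method):
   * <pre>
   *   HashSet&lt;String&gt; stopWords = WordlistLoader.getWordSet(new FileReader("stopwords.txt"));
   * </pre>
   *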
   * @param reader Reader containing the wordlist
   * @return A HashSet with the reader's words
   */
  public static HashSet<String> getWordSet(Reader reader) throws IOException {
    HashSet<String> result = new HashSet<String>();
    BufferedReader br = null;
    try {
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        result.add(word.trim());
      }
    }
    finally {
      if (br != null)
        br.close();
    }
    return result;
  }

  /**
   * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting
   * leading and trailing whitespace). Every line of the Reader should contain only
   * one word. The words need to be in lowercase if you use an Analyzer that
   * applies LowerCaseFilter (such as StandardAnalyzer).
   *
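   * <p>For example, ignoring lines that start with <code>#</code> in a hypothetical
   * <code>stopwords.txt</code> (the given Reader is closed by this method):
   * <pre>
   *   HashSet&lt;String&gt; stopWords = WordlistLoader.getWordSet(new FileReader("stopwords.txt"), "#");
   * </pre>
   *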
   * @param reader Reader containing the wordlist
   * @param comment The string representing a comment.
   * @return A HashSet with the reader's words
   */
  public static HashSet<String> getWordSet(Reader reader, String comment) throws IOException {
    HashSet<String> result = new HashSet<String>();
    BufferedReader br = null;
    try {
      if (reader instanceof BufferedReader) {
        br = (BufferedReader) reader;
      } else {
        br = new BufferedReader(reader);
      }
      String word = null;
      while ((word = br.readLine()) != null) {
        if (!word.startsWith(comment)) {
          result.add(word.trim());
        }
      }
    }
    finally {
      if (br != null)
        br.close();
    }
    return result;
  }



  /**
   * Reads a stem dictionary. Each line contains:
   * <pre>word<b>\t</b>stem</pre>
   * (i.e. two tab-separated words)
   *
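   * <p>For example, assuming a tab-delimited file named <code>stemdict.txt</code>
   * (the name is only illustrative):
   * <pre>
   *   HashMap&lt;String, String&gt; stemDict = WordlistLoader.getStemDict(new File("stemdict.txt"));
   * </pre>
   *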
   * @param wordstemfile the tab-delimited file containing the word/stem pairs
   * @return stem dictionary that overrules the stemming algorithm
   * @throws IOException if the file could not be read
   */
  public static HashMap<String, String> getStemDict(File wordstemfile) throws IOException {
    if (wordstemfile == null)
      throw new NullPointerException("wordstemfile may not be null");
    HashMap<String, String> result = new HashMap<String, String>();
    BufferedReader br = null;
    FileReader fr = null;
    try {
      fr = new FileReader(wordstemfile);
      br = new BufferedReader(fr);
      String line;
      while ((line = br.readLine()) != null) {
        String[] wordstem = line.split("\t", 2);
        result.put(wordstem[0], wordstem[1]);
      }
    } finally {
      if (br != null)
        br.close();
      else if (fr != null)
        fr.close();
    }
    return result;
  }

}