Logo Search packages:      
Sourcecode: uimaj version File versions  Download package

PersonTitleAnnotator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.examples.cas;

import java.util.Arrays;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;

/**
 * An example annotator that discovers Person Titles in text and classifies them into three
 * categories - Civilian (e.g. Mr., Ms.), Military (e.g. Lt. Col.), and Government (e.g. Gov.,
 * Sen.). The titles are detected using simple string matching. The strings that are matched are
 * determined by the <code>CivilianTitles</code>, <code>MilitaryTitles</code>, and
 * <code>GovernmentTitles</code> configuration parameters.
 * <p>
 * If the <code>ContainingAnnotationType</code> parameter is specified, this annotator will only
 * look for titles within existing annotations of that type. This feature can be used, for example,
 * to only match person titles within existing Person Name annotations, discovered by some annotator
 * that has run previously.
 * 
 * 
 */
00053 public class PersonTitleAnnotator extends CasAnnotator_ImplBase {
  /**
   * The Type of Annotation that we will be creating when we find a match.
   */
00057   private Type mPersonTitleType;

  /**
   * The Annotation Type within which we will search for Person Titles (optional).
   */
00062   private Type mContainingType;

  /**
   * The Feature representing the kind of PersonTitle - civilian, military, or government.
   */
00067   private Feature mPersonTitleKindFeature;

  /**
   * The list of civilian titles, read from the CivilianTitles configuration parameter.
   */
00072   private String[] mCivilianTitles;

  /**
   * The list of military titles, read from the MilitaryTitles configuration parameter.
   */
00077   private String[] mMilitaryTitles;

  /**
   * The list of government titles, read from the GovernmentTitles configuration parameter.
   */
00082   private String[] mGovernmentTitles;

  /**
   * Performs initialization logic. This implementation just reads values for the configuration
   * parameters.
   * 
   * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#initialize(AnnotatorContext)
   */
00090   public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);

    // read configuration parameter values
    mCivilianTitles = (String[]) getContext().getConfigParameterValue("CivilianTitles");
    mMilitaryTitles = (String[]) getContext().getConfigParameterValue("MilitaryTitles");
    mGovernmentTitles = (String[]) getContext().getConfigParameterValue("GovernmentTitles");

    // write log messages
    Logger logger = getContext().getLogger();
    logger.log(Level.CONFIG, "PersonTitleAnnotator initialized");
    logger.log(Level.CONFIG, "CivilianTitles = " + Arrays.asList(mCivilianTitles));
    logger.log(Level.CONFIG, "MilitaryTitles = " + Arrays.asList(mMilitaryTitles));
    logger.log(Level.CONFIG, "GovernmentTitles = " + Arrays.asList(mGovernmentTitles));
  }

  /**
   * Called whenever the CAS type system changes. Acquires references to Types and Features.
   * 
   * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#typeSystemInit(TypeSystem)
   */
00111   public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException {
    // Get a reference to the "PersonTitle" Type
    mPersonTitleType = aTypeSystem.getType("example.PersonTitle");
    if (mPersonTitleType == null) {
      throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND,
              new Object[] { getClass().getName(), "example.PersonTitle" });
    }

    // Get a reference to the "Kind" Feature
    mPersonTitleKindFeature = mPersonTitleType.getFeatureByBaseName("Kind");
    if (mPersonTitleKindFeature == null) {
      throw new AnalysisEngineProcessException(AnnotatorInitializationException.FEATURE_NOT_FOUND,
              new Object[] { getClass().getName(), "example.PersonTitle:Kind" });
    }

    // Get the value for the "ContainingType" parameter if there is one
    String containingTypeName = (String) getContext().getConfigParameterValue(
            "ContainingAnnotationType");
    if (containingTypeName != null) {
      mContainingType = aTypeSystem.getType(containingTypeName);
      if (mContainingType == null) {
        throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND,
                new Object[] { getClass().getName(), containingTypeName });
      }
    }
  }

  /**
   * Annotates a document. This annotator searches for person titles using simple string matching.
   * 
   * @param aCAS
   *          CAS containing document text and previously discovered annotations, and to which new
   *          annotations are to be written.
   * 
   * @see CasAnnotator_ImplBase#process(CAS)
   */
00147   public void process(CAS aCAS) throws AnalysisEngineProcessException {
    try {
      // If the ResultSpec doesn't include the PersonTitle type, we have
      // nothing to do.
      if (!getResultSpecification().containsType("example.PersonTitle",aCAS.getDocumentLanguage())) {
        return;
      }

      if (mContainingType == null) {
        // Search the whole document for PersonTitle annotations
        String text = aCAS.getDocumentText();
        annotateRange(aCAS, text, 0);
      } else {
        // Search only within annotations of type mContainingType

        // Get an iterator over the annotations of type mContainingType.
        FSIterator it = aCAS.getAnnotationIndex(mContainingType).iterator();
        // Loop over the iterator.
        while (it.isValid()) {
          // Get the next annotation from the iterator
          AnnotationFS annot = (AnnotationFS) it.get();
          // Get text covered by this annotation
          String coveredText = annot.getCoveredText();
          // Get begin position of this annotation
          int annotBegin = annot.getBegin();
          // search for matches within this
          annotateRange(aCAS, coveredText, annotBegin);
          // Advance the iterator.
          it.moveToNext();
        }
      }
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * A utility method that searches a part of the document for Person Titles.
   * 
   * @param aCAS
   *          the CAS in which to create new annotations
   * @param aText
   *          the substring of the document text within which to search
   * @param aBeginPos
   *          the position of this substring relative to the start of the document
   */
00193   protected void annotateRange(CAS aCAS, String aText, int aBeginPos) {
    // Search for each of the three types of titles
    annotateRange(aCAS, aText, aBeginPos, "Civilian", mCivilianTitles);
    annotateRange(aCAS, aText, aBeginPos, "Military", mMilitaryTitles);
    annotateRange(aCAS, aText, aBeginPos, "Government", mGovernmentTitles);
  }

  /**
   * A utility method that searches a part of the document for a specific kind of Person Title.
   * 
   * @param aCAS
   *          the CAS in which to create new annotations
   * @param aText
   *          the substring of the document text within which to search
   * @param aBeginPos
   *          the position of this substring relative to the start of the document
   * @param aTitleType
   *          the type of title to look for. This becomes the value of the <code>Kind</code>
   *          feature.
   * @param aTitles
   *          the exact strings to look for in the document
   * 
   */
00216   protected void annotateRange(CAS aCAS, String aText, int aBeginPos, String aTitleType,
          String[] aTitles) {
    // Loop over the matchStrings.
    for (int i = 0; i < aTitles.length; i++) {
      // logger.log("Looking for string: " + matchStrings[i]);
      // Find a first match, if it exists.
      int start = aText.indexOf(aTitles[i]);
      // Keep going while there are matches in the text.
      while (start >= 0) {
        // Set the end position (start + length of string).
        int end = start + aTitles[i].length();
        // Compute absolute position of annotation in document
        int absStart = aBeginPos + start;
        int absEnd = aBeginPos + end;
        // Write log message
        getContext().getLogger().log(Level.FINER,
                "Found \"" + aTitles[i] + "\" at (" + absStart + "," + absEnd + ")");
        // Create a new annotation for the most recently discovered match.
        createAnnotation(aCAS, absStart, absEnd, aTitleType);
        // Look for the next match, starting after the previous match.
        start = aText.indexOf(aTitles[i], end);
      }
    }
  }

  /**
   * Creates an PersonTitle annotation in the CAS.
   * 
   * @param aCAS
   *          the CAS in which to create the annotation
   * @param aBeginPos
   *          the begin position of the annotation relative to the start of the document
   * @param aEndPos
   *          the end position of the annotation relative to the start of the document. (Note that,
   *          as in the Java string functions, the end position is one past the last character in
   *          the annotation, so that (end - begin) = length.
   * @param aTitleType
   *          the type of person title. This becomes the value of the <code>Kind</code> feature.
   */
00255   protected void createAnnotation(CAS aCAS, int aBeginPos, int aEndPos, String aTitleType) {
    AnnotationFS title = aCAS.createAnnotation(mPersonTitleType, aBeginPos, aEndPos);
    // Set the "kind" feature if it's part of the ResultSpec
    if (getResultSpecification().containsFeature("example.PersonTitle:Kind",aCAS.getDocumentLanguage())) {
      title.setStringValue(mPersonTitleKindFeature, aTitleType);
    }
    // Add the annotation to the index.
    aCAS.getIndexRepository().addFS(title);
  }

}

Generated by  Doxygen 1.6.0   Back to index