// Logo Search packages:
// Sourcecode: uimaj version File versions  Download package
//
// XmlDetagger.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.tools.components;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Iterator;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.uima.UIMAFramework;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.util.XMLInputSource;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * A multi-sofa annotator that does XML detagging. Reads XML data from the input Sofa (named
 * "xmlDocument"); this data can be stored in the CAS as a string or array, or it can be a URI to a
 * remote file. The XML is parsed using the JVM's default parser, and the plain-text content is
 * written to a new sofa called "plainTextDocument".
 */
00051 public class XmlDetagger extends CasAnnotator_ImplBase {
  /**
   * Name of optional configuration parameter that contains the name of an XML tag that appears in
   * the input file. Only text that falls within this XML tag will be considered part of the
   * "document" that it is added to the CAS by this CAS Initializer. If not specified, the entire
   * file will be considered the document.
   */
00058   public static final String PARAM_TEXT_TAG = "XmlTagContainingText";
  
  private SAXParserFactory parserFactory = SAXParserFactory.newInstance();

  private Type sourceDocInfoType;

  private String mXmlTagContainingText = null;

    
00067   public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    // Get config param setting
    mXmlTagContainingText  = (String) getContext().getConfigParameterValue(PARAM_TEXT_TAG);
  }

00073   public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException {
    sourceDocInfoType = aTypeSystem.getType("org.apache.uima.examples.SourceDocumentInformation");
  }

00077   public void process(CAS aCAS) throws AnalysisEngineProcessException {
    // get handle to CAS view containing XML document
    CAS xmlCas = aCAS.getView("xmlDocument");
    InputStream xmlStream = xmlCas.getSofa().getSofaDataStream();

    // parse with detag handler
    DetagHandler handler = new DetagHandler();
    try {
      SAXParser parser = parserFactory.newSAXParser();
      parser.parse(xmlStream, handler);
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }

    // create the plain text view and set its document text
    CAS plainTextView = aCAS.createView("plainTextDocument");
    plainTextView.setDocumentText(handler.getDetaggedText());
    plainTextView.setDocumentLanguage(aCAS.getView("_InitialView").getDocumentLanguage());

    // Index the SourceDocumentInformation object, if there is one, in the new sofa.
    // This is needed by the SemanticSearchCasIndexer
    Iterator iter = xmlCas.getAnnotationIndex(sourceDocInfoType).iterator();
    if (iter.hasNext()) {
      FeatureStructure sourceDocInfoFs = (FeatureStructure) iter.next();
      plainTextView.getIndexRepository().addFS(sourceDocInfoFs);

    }

  }
  
  /**
   * Parses and returns the descriptor for this Analysis Gnein. The descriptor is stored in the
   * uima-core.jar file and located using the ClassLoader.
   * 
   * @return an object containing all of the information parsed from the descriptor.
   * 
   * @throws InvalidXMLException
   *           if the descriptor is invalid or missing
   */
00116   public static AnalysisEngineDescription getDescription() throws InvalidXMLException {
    InputStream descStream = XmlDetagger.class
            .getResourceAsStream("XmlDetagger.xml");
    return UIMAFramework.getXMLParser().parseAnalysisEngineDescription(
            new XMLInputSource(descStream, null));
  }  

  public static URL getDescriptorURL() {
    return XmlDetagger.class.getResource("XmlDetagger.xml");
  }  

00127   class DetagHandler extends DefaultHandler {
    private StringBuffer detaggedText = new StringBuffer();
    private boolean insideTextTag;

    public DetagHandler() {
      insideTextTag = (mXmlTagContainingText == null);
    }
        
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
      if (qName.equalsIgnoreCase(mXmlTagContainingText)) {
        insideTextTag = true;
      }
    }

    public void endElement(String uri, String localName, String qName) throws SAXException {
      if (qName.equalsIgnoreCase(mXmlTagContainingText)) {
        insideTextTag = false;
      }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
      if (insideTextTag) {
        detaggedText.append(ch, start, length);        
      }
    }
    
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
      if (insideTextTag) {
        detaggedText.append(ch, start, length);        
      }
    }

    String getDetaggedText() {
      return detaggedText.toString();
    }
  }
}

// Generated by  Doxygen 1.6.0   Back to index