Logo Search packages:      
Sourcecode: uimaj version File versions  Download package

PersonTitleDBWriterCasConsumer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.examples.cpe;

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.ProcessTrace;

import example.PersonTitle;

/**
 * A simple CAS consumer that creates a Derby (Cloudscape) database in the file system. You can
 * obtain this database from http://incubator.apache.org/derby/
 * <p>
 * This CAS Consumer takes one parameter:
 * <ul>
 * <li><code>OutputDirectory</code> - path to directory which is the "System" directory for the
 * derby DB. </li>
 * </ul>
 * 
 * On the first CAS it deletes the <code>ExamplePersonTitleDB</code> database found at the system
 * location, if any (!!!), creates a new database (this takes the most time - on the order of 10+
 * seconds), creates a table in the database to hold instances of the PersonTitle annotation, and
 * then adds an entry for each PersonTitle annotation in each CAS to the database.
 * 
 * To use:
 * <ul>
 * <li>add derby.jar to the classpath when you start the CPE GUI;</li>
 * <li>run the CPE GUI and select the Name Recognizer and Person Title Annotator aggregate;</li>
 * <li>a good sample collection reader is the FileSystemCollectionReader; and</li>
 * <li>a good sample data set is &lt;UIMA_HOME&gt;/examples/data.</li>
 * </ul>
 * 
 * The processing is set up to handle multiple CASes. The end is indicated by using the
 * CollectionProcessComplete call.
 * 
 * Batching of updates to the database is done. The batch size is set to 50. The larger size takes
 * more Java heap space, but perhaps runs more efficiently.
 * 
 * The Table is populated with a slightly denormalized form of the data: the URI of the document is
 * included with every record.
 * 
 * 
 */
00071 public class PersonTitleDBWriterCasConsumer extends CasConsumer_ImplBase {
  /**
   * Name of configuration parameter that must be set to the path of a directory into which the
   * Derby Database will be written.
   */
00076   public static final String PARAM_OUTPUTDIR = "OutputDirectory";

  public static final int MAX_URI_LENGTH = 80;

  public static final int MAX_TITLE_LENGTH = 20;

  public static final int DB_LOAD_BATCH_SIZE = 50;

  private int batchCounter = DB_LOAD_BATCH_SIZE;

  private File mOutputDir;

  private boolean firstCall = true;

  private static boolean firstEverCall = true;

  private PreparedStatement stmt;

  private Connection con;

  private long startTime;

00098   public void initialize() throws ResourceInitializationException {
    startTime = System.currentTimeMillis();
    System.out
            .println("Time: " + (System.currentTimeMillis() - startTime) + " initialize() called");
    mOutputDir = new File((String) getConfigParameterValue(PARAM_OUTPUTDIR));
    if (!mOutputDir.exists()) {
      mOutputDir.mkdirs();
    }

    // make this the derby home by setting system property
    System.setProperty("derby.system.home", mOutputDir.toString());
    System.out.println("Time: " + (System.currentTimeMillis() - startTime)
            + " DB Writer: Set derby system home to: '" + mOutputDir.toString() + "'");
  }

  /**
   * Processes the CasContainer which was populated by the TextAnalysisEngines. <br>
   * In this case, the CAS is assumed to contain annotations of type PersonTitle, created with the
   * PersonTitleAnnotator. These Annotations are stored in a database table called PersonTitle.
   * 
   * @param aCAS
   *          CasContainer which has been populated by the TAEs
   * 
   * @throws ResourceProcessException
   *           if there is an error in processing the Resource
   * 
   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
   */
00126   public void processCas(CAS aCAS) throws ResourceProcessException {
    System.out.println("Time: " + (System.currentTimeMillis() - startTime)
            + " DB Writer: ProcessCas called");
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new ResourceProcessException(e);
    }

    try {
      if (firstCall) {
        firstCall = false;
        System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                + " DB Writer: First Time Initiailization: ");
        // NOTE TO USERS: a better design will be to do the loading of the
        // driver in the initialize() method, where it can
        // throw a ResourceInitialization Exception if it can't
        // load (and perhaps even connect to) the database

        // load the driver
        // Depends on "derby.system.property" set in initialize
        // Different databases have different classes they load
        // This is the one for derby - for the "Embedded" database
        // (derby also has a network accessed database driver)
        if (firstEverCall) {
          firstEverCall = false;
          System.out
                  .println("Time: "
                          + (System.currentTimeMillis() - startTime)
                          + " DB Writer: Doing first process call ever (even during re-runs) initialization");
          try {
            // note: newInstance() call is needed to reinitialize properly after
            // derby has been shutdown
            Class.forName("org.apache.derby.jdbc.EmbeddedDriver").newInstance();
            System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                    + " DB Writer:    Loaded derby DB driver OK");
          } catch (ClassNotFoundException e) {
            System.err.println("No driver found for derby - check class path.");
          } catch (InstantiationException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
          } catch (IllegalAccessException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
          }
        }
        // create and connect to "ExamplePersonTitleDB" -
        // in derby, this is a directory name under the "System"
        // directory set with the above parameter
        // the System directory is passed to the JVM as a property
        // (see the derby documentation). Or - you can specify a
        // complete path, using //forward slashes on windows, such as
        // "jdbc:derby:c:/a/b/myDB".

        // NOTE TO USERS: In a real application you would probably
        // not delete the database and re-create it - this takes a long
        // time. If you want to get rid of the particular table you're
        // about to create if it already exists, use
        // the SQL "Drop" command

        // if database exists, delete it
        File db = new File(mOutputDir.toString() + "/ExamplePersonTitleDB");
        if (db.exists()) {
          System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                  + " DB Writer: First Time Initiailization: Deleting Database");
          deleteDir(db);
          System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                  + " DB Writer: First Time Initiailization: Database deleted");

        }

        con = DriverManager.getConnection("jdbc:derby:ExamplePersonTitleDB;create=true");
        System.out
                .println("Time: "
                        + (System.currentTimeMillis() - startTime)
                        + " DB Writer: First Time Initiailization: Created the ExamplePersonTitleDB and connected to it.");

        // Databases typically use user-names and passwords; these can
        // be passed as //properties to the getConnection method.

        // drop the table in case it's already present
        // This isn't needed because we're starting from an empty database,
        // but leave here for tutorial reasons
        Statement sqlStmt = con.createStatement();
        try {
          sqlStmt.execute("drop table PersonTitle");
        } catch (SQLException e) {
        }
        sqlStmt.execute("create table PersonTitle(" + "uri varchar(" + MAX_URI_LENGTH
                + "), spannedText varchar(" + MAX_TITLE_LENGTH
                + "), beginOffset int, endOffset int)");
        System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                + " DB Writer: First Time Initiailization: Created the PersonTitle table.");

        sqlStmt.close(); // free resources associated with this
        // statement

        // Entering data: use a Statement, or a PreparedStatement
        stmt = con.prepareStatement("insert into PersonTitle values (?, ?, ?, ?)");
        // Writing out all instances of type uima.tcas.Annotation
        // Assume the variable "jcas" holds a reference to a JCas
        con.setAutoCommit(false); // need this for batch updating
      }

      // get the singleton instance of the SourceDocumentInformation
      SourceDocumentInformation sdi = (SourceDocumentInformation) 
              jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator().next();

      System.out.println("Time: " + (System.currentTimeMillis() - startTime)
              + " DB Writer: Processing doc: '" + sdi.getUri() + "'");

      stmt.setString(1, truncate(sdi.getUri(), MAX_URI_LENGTH));
      for (FSIterator iter = jcas.getAnnotationIndex(PersonTitle.type).iterator(); 
           iter.hasNext();) {
        PersonTitle pt = (PersonTitle) iter.next();
        stmt.setString(2, truncate(pt.getCoveredText(), MAX_TITLE_LENGTH));
        stmt.setInt(3, pt.getBegin());
        stmt.setInt(4, pt.getEnd());
        stmt.addBatch();
        batchCounter--;
        if (batchCounter <= 0) {
          System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                  + " DB Writer: Batch writing updates - process call");
          stmt.executeBatch();
          // NOTE TO USERS: Although we "commit" here, you may want
          // to delay committing until batchProcessComplete - or some
          // other logical point - to keep the DB in a more consistent
          // state (not partially updated).

          con.commit();
          batchCounter = DB_LOAD_BATCH_SIZE;
        }
      }
    } catch (SQLException e) {
      try {
        // NOTE TO USERS: depending on your error recover logic, you'll
        // probably want to do both a rollback and a clearBatch if an
        // exception occurs.
        con.rollback();
      } catch (SQLException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
      }
      try {
        stmt.clearBatch();
      } catch (SQLException e2) {
        // TODO Auto-generated catch block
        e2.printStackTrace();
      }
      throw new ResourceProcessException(e);
    }
  }

00280   public void collectionProcessComplete(ProcessTrace arg0) throws ResourceProcessException,
          IOException {
    firstCall = true;

    try {
      if (batchCounter < DB_LOAD_BATCH_SIZE) {
        System.out.println("Time: " + (System.currentTimeMillis() - startTime)
                + " DB Writer: Batch writing updates - processComplete call");
        stmt.executeBatch();
        con.commit();
        batchCounter = DB_LOAD_BATCH_SIZE;
      }

      stmt.close();
      con.close();
      System.out.println("Time: " + (System.currentTimeMillis() - startTime)
              + " DB Writer: Sucessfully closed the connection - done.");

    } catch (SQLException e) {
      System.err.println("Unexpected SQL exception");
      e.printStackTrace();
    }
    try {
      DriverManager.getConnection("jdbc:derby:ExamplePersonTitleDB;shutdown=true");
    } catch (SQLException e) {
    }

    // If we shut down the db - we get a "no suitable driver" SQL exception if rerunning
    try {
      firstEverCall = true;
      DriverManager.getConnection("jdbc:derby:;shutdown=true");
    } catch (SQLException e) {
    }
  }

  private void deleteDir(File f) {
    if (f.isDirectory()) {
      String[] contents = f.list();
      for (int i = 0; i < contents.length; i++) {
        deleteDir(new File(f.toString() + "/" + contents[i]));
      }
    }
    f.delete();
  }

  private String truncate(String s, int length) {
    if (s.length() <= length)
      return s;
    return s.substring(0, length);
  }

}

Generated by  Doxygen 1.6.0   Back to index