2011/07/21

Elasticsearch in 10 Minutes

We produce a lot of PDF files with documentation. Way too many. The problem is that nobody knows which document a given piece of documentation belongs to, or where it actually is.

Some fulltext search would help. I recently stumbled upon elasticsearch - a schema-free, scalable search engine built on Apache Lucene. I decided to give it a try - not because of its distributed nature, but because of its REST interface.

I did the following four steps to get a simple fulltext search working:

1/ Extracted text from the PDFs using pdftotext and a simple bash one-liner (quoting the file name so paths with spaces survive):
for FILE in *.pdf; do pdftotext "$FILE"; done

2/ Created a Java Maven project. The elasticsearch pom.xml I found did not contain all the necessary dependencies, so I had to add them to my pom.xml myself, and the result is a bit messy.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
                             http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>docsearch</groupId>
    <artifactId>docsearch</artifactId>
    <version>1.0</version>

    <repositories>
     <repository>
       <id>fuse</id>
       <url>http://repo.fusesource.com/maven2/</url>
     </repository>
    </repositories>

    <dependencies>
      <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>0.16.0</version>
      </dependency>

      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>3.3.0</version>
      </dependency>

      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers</artifactId>
        <version>3.3.0</version>
      </dependency>

      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-snowball</artifactId>
        <version>3.0.3</version>
      </dependency>

      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-fast-vector-highlighter</artifactId>
        <version>3.0.3</version>
      </dependency>

      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-highlighter</artifactId>
        <version>2.4.0</version>
      </dependency>

      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queries</artifactId>
        <version>2.4.0</version>
      </dependency>

    </dependencies>

</project> 
 
3/ Downloaded the elasticsearch release and started it.
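
(A side note: the Java API can talk to the running instance in two ways - over the wire with a TransportClient, which is what the code in step 4 uses, or by starting a lightweight "client" node inside the JVM that joins the cluster itself. Below is a minimal sketch of the node-client variant in case you prefer it; the class name is made up and the API calls are from the 0.16-era client I was using.)

import org.elasticsearch.client.Client;
import org.elasticsearch.node.Node;

import static org.elasticsearch.node.NodeBuilder.nodeBuilder;

public class NodeClientExample
{
  public static void main (String[] args)
  {
     // client(true) starts a node that stores no data; it only joins the
     // cluster of the instance started above and routes requests to it
     Node node = nodeBuilder().client(true).node();
     Client client = node.client();

     // ... index and search with the client here ...

     node.close();
  }
}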

4/ Wrote some simple Java code to iterate over the files, read them line by line and feed them to the running elasticsearch service:

import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

import java.io.*;

import static java.lang.System.out;
import static org.elasticsearch.common.xcontent.XContentFactory.*;

public class Main
{
  final static String dataDirName = "/tmp/doc";

  public static void main (String[] args)
  {
     File dataDir = new File(dataDirName);

     if ( dataDir.exists() && dataDir.isDirectory() )
     {
        // pick up the text files produced by pdftotext
        File[] files = dataDir.listFiles
        (
           new FilenameFilter()
           {
              public boolean accept(File dir, String name)
              { return name.endsWith(".txt"); }
           }
        );

        // connect to the running elasticsearch instance over its transport port
        Client client = new TransportClient()
                       .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));

        String indexName = "docs";
        String docType = "doc";
        String docId = null;
        for (File file : files)
        {
           try
           {
              // read the whole file into a single string, keeping a newline
              // between lines so words at line breaks do not get glued together
              BufferedReader reader = new BufferedReader ( new FileReader(file) );
              String line;
              StringBuilder fileContent = new StringBuilder();
              while ( (line = reader.readLine()) != null)
              { fileContent.append(line).append('\n'); }
              reader.close();

              // index the document, using the file name as the document id
              docId = file.getName();
              IndexResponse response =
                client.prepareIndex(indexName, docType, docId).setSource
                  ( jsonBuilder().startObject().field("content", fileContent.toString()).endObject() )
                .execute().actionGet();
              out.println("indexed " + docId);
           }
           catch (FileNotFoundException ex) { ex.printStackTrace(); }
           catch (IOException ex) { ex.printStackTrace(); }
        }

        client.close();
     }
  }
}
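
With everything indexed, the documents can be queried straight over the REST interface or, staying in Java, with the same client. Here is a minimal search sketch - the query term is just an example, and the query-builder package moved around a bit in the early 0.x releases, so the imports may need adjusting for your exact version:

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;

public class Search
{
  public static void main (String[] args)
  {
     Client client = new TransportClient()
                    .addTransportAddress(new InetSocketTransportAddress("localhost", 9300));

     // fulltext query against the "content" field of the "docs" index;
     // "installation" is just an example search term
     SearchResponse response =
       client.prepareSearch("docs")
             .setQuery(QueryBuilders.queryString("content:installation"))
             .execute().actionGet();

     // the document ids are the file names, so the hits tell us which files match
     for (SearchHit hit : response.hits())
     { System.out.println(hit.id() + "  (score " + hit.score() + ")"); }

     client.close();
  }
}

The same query over plain REST is just a GET to http://localhost:9200/docs/_search?q=content:installation - which is exactly the interface I was after in the first place.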
