首页

使用pdfbox通过lucene为PDF文件生成索引文件的IndexPDFFiles类代码示例

标签:pdfbox,lucene,pdf索引生成,代码示例     发布时间:2018-04-15   

一、前言

下面以pdfbox-2.0.9源码中的org.apache.pdfbox.examples.lucene.IndexPDFFiles类为例,演示如何基于Apache Lucene的org.apache.lucene.index.IndexWriter为PDF文件生成索引,并给出完整的代码示例。

二、代码示例

package org.apache.pdfbox.examples.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Index all pdf files under a directory.
 * <p>
 * This is a command-line application demonstrating simple Lucene indexing. Run it with no command-line arguments for
 * usage information.
 * <p>
 * It's based on a demo provided by the lucene project.
 */
public final class IndexPDFFiles
{
    /** File name suffix (compared case-insensitively) that marks a file as a PDF document. */
    private static final String PDF_SUFFIX = ".pdf";

    private static final String USAGE = "java org.apache.pdfbox.examples.lucene.IndexPDFFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes all PDF documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles";

    private IndexPDFFiles()
    {
    }

    /**
     * Index all PDF files under a directory.
     * <p>
     * Recognised options: {@code -index INDEX_PATH} (defaults to {@code "index"}), {@code -docs DOCS_PATH}
     * (required) and {@code -update} (append to / update an existing index instead of recreating it).
     * The process exits with status 1 when the arguments are incomplete or the document directory is not
     * readable.
     *
     * @param args command line arguments
     */
    public static void main(String[] args)
    {
        String indexPath = "index";
        String docsPath = null;
        boolean create = true;
        for (int i = 0; i < args.length; i++)
        {
            String arg = args[i];
            if ("-index".equals(arg) || "-docs".equals(arg))
            {
                // Both options take a value; guard against the option being the last argument,
                // which would otherwise raise ArrayIndexOutOfBoundsException.
                if (i + 1 >= args.length)
                {
                    System.err.println("Missing value for option " + arg);
                    System.err.println("Usage: " + USAGE);
                    System.exit(1);
                    return;
                }
                String value = args[++i];
                if ("-index".equals(arg))
                {
                    indexPath = value;
                }
                else
                {
                    docsPath = value;
                }
            }
            else if ("-update".equals(arg))
            {
                create = false;
            }
        }

        if (docsPath == null)
        {
            System.err.println("Usage: " + USAGE);
            System.exit(1);
            return;
        }

        final File docDir = new File(docsPath);
        if (!docDir.exists() || !docDir.canRead())
        {
            System.out.println("Document directory '" + docDir.getAbsolutePath()
                    + "' does not exist or is not readable, please check the path");
            System.exit(1);
            return;
        }

        Date start = new Date();
        try
        {
            System.out.println("Indexing to directory '" + indexPath + "'...");

            Directory dir = FSDirectory.open(new File(indexPath));
            try
            {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);

                // CREATE builds a new index, removing any previously indexed documents;
                // CREATE_OR_APPEND adds new documents to an existing index.
                iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

                // Optional: for better indexing performance, if you
                // are indexing many documents, increase the RAM
                // buffer. But if you do this, increase the max heap
                // size to the JVM (eg add -Xmx512m or -Xmx1g):
                //
                // iwc.setRAMBufferSizeMB(256.0);

                IndexWriter writer = new IndexWriter(dir, iwc);
                try
                {
                    indexDocs(writer, docDir);

                    // NOTE: if you want to maximize search performance,
                    // you can optionally call forceMerge here. This can be
                    // a terribly costly operation, so generally it's only
                    // worth it when your index is relatively static (ie
                    // you're done adding documents to it):
                    //
                    // writer.forceMerge(1);
                }
                finally
                {
                    // Always close the writer, even when indexing fails part-way through.
                    writer.close();
                }
            }
            finally
            {
                dir.close();
            }

            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() + " total milliseconds");
        }
        catch (IOException e)
        {
            System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        }
    }

    /**
     * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories
     * found under the given directory. Unreadable entries are ignored and files whose name does not end in
     * {@code .pdf} (case-insensitive) are skipped.
     *
     * @param writer Writer to the index where the given file/dir info will be stored
     * @param file The file to index, or the directory to recurse into to find files to index
     * @throws IOException If there is a low-level I/O error
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException
    {
        // do not try to index files that cannot be read
        if (!file.canRead())
        {
            return;
        }

        if (file.isDirectory())
        {
            String[] files = file.list();
            // list() returns null when an IO error occurs
            if (files != null)
            {
                for (String fileName : files)
                {
                    indexDocs(writer, new File(file, fileName));
                }
            }
            return;
        }

        indexFile(writer, file);
    }

    /**
     * Adds or updates a single regular file in the index; non-PDF files are reported and skipped.
     */
    private static void indexFile(IndexWriter writer, File file) throws IOException
    {
        if (!isPdf(file))
        {
            System.out.println("Skipping " + file);
            return;
        }

        if (!canOpen(file))
        {
            return;
        }

        System.out.println("Indexing PDF document: " + file);
        Document doc = LucenePDFDocument.getDocument(file);

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE)
        {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        }
        else
        {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("uid", LucenePDFDocument.createUID(file)), doc);
        }
    }

    /**
     * Tells whether the file name ends with {@code .pdf}, ignoring case and independent of the default locale.
     */
    private static boolean isPdf(File file)
    {
        return file.getName().toLowerCase(Locale.ROOT).endsWith(PDF_SUFFIX);
    }

    /**
     * Probes whether the file can actually be opened for reading.
     * <p>
     * At least on windows, some temporary files raise FileNotFoundException with an "access denied" message
     * even though {@link File#canRead()} returned true, so the only reliable check is to open the file.
     */
    private static boolean canOpen(File file) throws IOException
    {
        FileInputStream probe;
        try
        {
            probe = new FileInputStream(file);
        }
        catch (FileNotFoundException ignored)
        {
            // deliberately best-effort: such files are silently left out of the index
            return false;
        }
        probe.close();
        return true;
    }
}