以下程序使用 Apache Tika 从 Microsoft Office(Excel)文档中提取内容和元数据。
import java.io.File;import java.io.FileInputStream;import java.io.IOException;import org.apache.tika.exception.TikaException;import org.apache.tika.metadata.Metadata;import org.apache.tika.parser.ParseContext;import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;import org.apache.tika.sax.BodyContentHandler;import org.xml.sax.SAXException;public class MSExcelParse { public static void main(final String[] args) throws IOException, TikaException { //detecting the file type BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); FileInputStream inputstream = new FileInputStream(new File("example_msExcel.xlsx")); ParseContext pcontext = new ParseContext(); //OOXml parser OOXMLParser msofficeparser = new OOXMLParser (); msofficeparser.parse(inputstream, handler, metadata,pcontext); System.out.println("Contents of the document:" + handler.toString()); System.out.println("Metadata of the document:"); String[] metadataNames = metadata.names(); for(String name : metadataNames) { System.out.println(name + ": " + metadata.get(name)); } }}
将上述代码保存为 MSExcelParse.java,然后在命令提示符下使用以下命令编译并运行它:
javac MSExcelParse.java
java MSExcelParse
这里我们传递以下示例Excel文件.
[图片:示例 Excel 文件]
给定的Excel文件具有以下属性 :
[图片:Excel 文件属性]
执行上述程序后,您将获得以下输出.
输出 :
Contents of the document:
Sheet1
Name    Age    Designation    Salary
Ramu    50    Manager    50,000
Raheem    40    Assistant manager    40,000
Robert    30    Superviser    30,000
sita    25    Clerk    25,000
sameer    25    Section in-charge    20,000

Metadata of the document:
meta:creation-date: 2006-09-16T00:00:00Z
dcterms:modified: 2014-09-28T15:18:41Z
meta:save-date: 2014-09-28T15:18:41Z
Application-Name: Microsoft Excel
extended-properties:Company: 
dcterms:created: 2006-09-16T00:00:00Z
Last-Modified: 2014-09-28T15:18:41Z
Application-Version: 15.0300
date: 2014-09-28T15:18:41Z
publisher: 
modified: 2014-09-28T15:18:41Z
Creation-Date: 2006-09-16T00:00:00Z
extended-properties:AppVersion: 15.0300
protected: false
dc:publisher: 
extended-properties:Application: Microsoft Excel
Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
Last-Save-Date: 2014-09-28T15:18:41Z