Java Parser类使用实例

发布时间:2025-12-09 11:55:51 浏览次数:1

实例1: extractText

import org.apache.tika.parser.Parser; //导入依赖的package包/类@Overridepublic void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)throws IOException{try{Metadata meta = new Metadata();ContentHandler handler = new BodyContentHandler();Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));parser.parse(input, handler, meta, new ParseContext());String content = handler.toString();if( content.length() > maxSize ){content = content.substring(0, maxSize);}outputText.append(content);if( LOGGER.isDebugEnabled() ){LOGGER.debug("Word Summary:" + content); //$NON-NLS-1$}}catch( Exception e ){throw new RuntimeException(e);}} 

实例2: convertWordDocumentIntoHtml

import org.apache.tika.parser.Parser; //导入依赖的package包/类/** * Converts a .docx document into HTML markup. This code * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer. * * @param wordDocument  The converted .docx document. * @return */public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) {    LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename());    try {        InputStream input = wordDocument.getInputStream();        Parser parser = new OOXMLParser();        StringWriter sw = new StringWriter();        SAXTransformerFactory factory = (SAXTransformerFactory)                SAXTransformerFactory.newInstance();        TransformerHandler handler = factory.newTransformerHandler();        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");        handler.setResult(new StreamResult(sw));        Metadata metadata = new Metadata();        metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8");        parser.parse(input, handler, metadata, new ParseContext());        return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString());    }    catch (IOException | SAXException | TransformerException | TikaException ex) {        LOGGER.error("Conversion failed because an exception was thrown", ex);        throw new DocumentConversionException(ex.getMessage(), ex);    }} 

实例3: getFullText

import org.apache.tika.parser.Parser; //导入依赖的package包/类private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {    StringWriter writer = new StringWriter();        final TikaInputStream inputStream =  TikaInputStream.get(new File(filepath));    try {        final Detector detector = new DefaultDetector();        final Parser parser = new AutoDetectParser(detector);        final Metadata metadata = new Metadata();        final ParseContext parseContext = new ParseContext();        parseContext.set(Parser.class, parser);                ContentHandler contentHandler = new BodyContentHandler(writer);        parser.parse(inputStream, contentHandler, metadata, parseContext);    }    finally {        inputStream.close();    }        return writer.toString();} 
JAVAPARSER好妈妈儿
需要做网站?需要网络推广?欢迎咨询客户经理 13272073477