发布时间:2025-12-09 11:55:51 浏览次数:1
import org.apache.tika.parser.Parser; //导入依赖的package包/类@Overridepublic void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)throws IOException{try{Metadata meta = new Metadata();ContentHandler handler = new BodyContentHandler();Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));parser.parse(input, handler, meta, new ParseContext());String content = handler.toString();if( content.length() > maxSize ){content = content.substring(0, maxSize);}outputText.append(content);if( LOGGER.isDebugEnabled() ){LOGGER.debug("Word Summary:" + content); //$NON-NLS-1$}}catch( Exception e ){throw new RuntimeException(e);}} import org.apache.tika.parser.Parser; //导入依赖的package包/类/** * Converts a .docx document into HTML markup. This code * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer. * * @param wordDocument The converted .docx document. * @return */public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) { LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename()); try { InputStream input = wordDocument.getInputStream(); Parser parser = new OOXMLParser(); StringWriter sw = new StringWriter(); SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); handler.setResult(new StreamResult(sw)); Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8"); parser.parse(input, handler, metadata, new ParseContext()); return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString()); } catch (IOException | SAXException | TransformerException | TikaException ex) { LOGGER.error("Conversion failed because an exception was thrown", ex); throw new DocumentConversionException(ex.getMessage(), ex); }} import org.apache.tika.parser.Parser; //导入依赖的package包/类private static String getFullText(final String filepath) throws IOException, SAXException, TikaException { StringWriter writer = new StringWriter(); final TikaInputStream inputStream = TikaInputStream.get(new File(filepath)); try { final Detector detector = new DefaultDetector(); final Parser parser = new AutoDetectParser(detector); final Metadata metadata = new Metadata(); final ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); ContentHandler contentHandler = new BodyContentHandler(writer); parser.parse(inputStream, contentHandler, metadata, parseContext); } finally { inputStream.close(); } return writer.toString();}