fengkaobiguo / document_processor.py
Yaoliang's picture
只保留gemini模型
221e227
import os
import PyPDF2
from docx import Document
import markdown
from bs4 import BeautifulSoup
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument
import logging
import gradio as gr
from simple_qa import call_llm_api
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()
# Configure root logging at INFO level and create the logger used by this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Read supported documents and split their text into LangChain chunks.

    Supported extensions are ``.txt``, ``.pdf``, ``.docx`` and ``.md``.
    Every ``read_*`` method is best-effort: a failure is logged and reported
    as an empty string instead of being raised.
    """

    # Single source of truth for the supported formats: lower-case extension
    # mapped to the name of the reader method.  Looked up with ``getattr`` so
    # that subclasses overriding a reader are still honoured.
    _READER_METHODS = {
        ".txt": "read_text_file",
        ".pdf": "read_pdf_file",
        ".docx": "read_docx_file",
        ".md": "read_markdown_file",
    }

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the processor and its text splitter.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Number of characters shared by adjacent chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def read_text_file(self, file_path: str) -> str:
        """Return the content of a UTF-8 text file, or ``""`` on failure."""
        try:
            # 'utf-8-sig' decodes plain UTF-8 identically but also drops a
            # leading byte-order mark, which would otherwise end up in the
            # first chunk as a stray U+FEFF character.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                return file.read()
        except Exception as e:
            logger.error(f"读取文本文件失败: {e}")
            return ""

    def read_pdf_file(self, file_path: str) -> str:
        """Return the text of every PDF page, newline-terminated, or ``""`` on failure."""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Guard against a page that yields no text (None): without
                # the fallback one such page raises TypeError and the whole
                # document is discarded by the except clause below.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            logger.error(f"读取PDF文件失败: {e}")
            return ""

    def read_docx_file(self, file_path: str) -> str:
        """Return the text of every paragraph, newline-terminated, or ``""`` on failure."""
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"读取Word文档失败: {e}")
            return ""

    def read_markdown_file(self, file_path: str) -> str:
        """Render a Markdown file to HTML and return its plain text, or ``""`` on failure."""
        try:
            # A byte-order mark in front of a first-line heading would stop
            # it from being parsed as a heading, so strip it while decoding.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                md_content = file.read()
            html = markdown.markdown(md_content)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            logger.error(f"读取Markdown文件失败: {e}")
            return ""

    def process_file(self, file_path: str) -> "List[LangchainDocument]":
        """Read one file and split it into chunks.

        Args:
            file_path: Path of the file; the reader is chosen from its
                extension, compared case-insensitively.

        Returns:
            The chunks produced by the text splitter, each carrying
            ``source``, ``file_type`` and ``file_name`` metadata.  An empty
            list is returned for an unsupported extension or when no
            non-whitespace text could be read.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        reader_name = self._READER_METHODS.get(file_extension)
        if reader_name is None:
            logger.warning(f"不支持的文件格式: {file_extension}")
            return []

        content = getattr(self, reader_name)(file_path)
        if not content.strip():
            logger.warning(f"文件内容为空: {file_path}")
            return []

        doc = LangchainDocument(
            page_content=content,
            metadata={
                "source": file_path,
                "file_type": file_extension,
                "file_name": os.path.basename(file_path),
            },
        )
        chunks = self.text_splitter.split_documents([doc])
        logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
        return chunks

    def process_directory(self, directory_path: str) -> "List[LangchainDocument]":
        """Recursively process every supported file below a directory.

        Args:
            directory_path: Root directory to walk.

        Returns:
            The chunks of all supported files, concatenated in sorted
            traversal order.
        """
        all_chunks = []
        for root, dirs, files in os.walk(directory_path):
            # os.walk yields entries in filesystem order; sorting in place
            # (dirs) and per directory (files) makes the chunk order
            # reproducible across runs and machines.
            dirs.sort()
            for file in sorted(files):
                file_extension = os.path.splitext(file)[1].lower()
                if file_extension not in self._READER_METHODS:
                    continue
                file_path = os.path.join(root, file)
                logger.info(f"处理文件: {file_path}")
                all_chunks.extend(self.process_file(file_path))
        logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
        return all_chunks
def qa_func(text, model_type="gemini", max_questions=5):
    """Generate question/answer pairs for ``text`` through ``call_llm_api``.

    Args:
        text: Document content or question entered by the user.
        model_type: Model identifier forwarded to ``call_llm_api``.  Falls
            back to ``"gemini"`` (the only model the UI offers) when empty
            or ``None``, e.g. when no radio option was selected.
        max_questions: Number of question/answer pairs to request; coerced
            to ``int`` because a slider may deliver a non-integer number.

    Returns:
        Whatever ``call_llm_api`` returns.
    """
    if not model_type:
        model_type = "gemini"
    return call_llm_api(
        text,
        model_type=model_type,
        max_questions=int(max_questions),
    )
# Gradio front end: a text box, a model selector and a question-count slider
# wired to qa_func, with plain-text output.
iface = gr.Interface(
    fn=qa_func,
    inputs=[
        gr.Textbox(label="请输入你的文档内容或问题"),
        # Preselect the only available model so the callback does not
        # receive None when the user never touches the radio group.
        gr.Radio(["gemini"], value="gemini", label="选择模型"),
        # step=1 restricts the slider to whole numbers; without an explicit
        # step fractional question counts can be selected.
        gr.Slider(1, 10, value=5, step=1, label="生成问答对数量"),
    ],
    outputs="text",
    title="逢考必过·AI考试复习助手",
    description="输入文档内容,自动生成高质量问答对,仅支持 Gemini 大模型。",
)

# Start the web UI only when the module is executed as a script.
if __name__ == "__main__":
    iface.launch()