"""Document loading/chunking utilities and a Gradio front end for QA generation.

``DocumentProcessor`` reads ``.txt``, ``.pdf``, ``.docx`` and ``.md`` files and
splits their text into LangChain documents.  The module also builds a Gradio
interface (``iface``) that forwards user input to ``call_llm_api``.
"""

import logging
import os
from typing import List, Dict, Any

import gradio as gr
import markdown
import PyPDF2
from bs4 import BeautifulSoup
from docx import Document
from dotenv import load_dotenv
from langchain.schema import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter

from simple_qa import call_llm_api

load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Read supported document types and split them into text chunks."""

    # Single source of truth: file extension -> name of the reader method.
    # Method names (not bound methods) are stored so subclass overrides of a
    # reader are still honoured when dispatching.
    _READERS = {
        '.txt': 'read_text_file',
        '.pdf': 'read_pdf_file',
        '.docx': 'read_docx_file',
        '.md': 'read_markdown_file',
    }

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create a processor.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks, measured with
                ``len``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def read_text_file(self, file_path: str) -> str:
        """Return the UTF-8 text of ``file_path``, or ``""`` if reading fails."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            # Best-effort: a single unreadable file must not abort a batch run.
            logger.error(f"读取文本文件失败: {e}")
            return ""

    def read_pdf_file(self, file_path: str) -> str:
        """Return the text of every PDF page, each followed by a newline.

        Returns ``""`` if the file cannot be opened or parsed.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # ``or ""`` guards against a page yielding None, which would
                # otherwise raise on concatenation and discard the whole file.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            logger.error(f"读取PDF文件失败: {e}")
            return ""

    def read_docx_file(self, file_path: str) -> str:
        """Return the paragraph text of a Word document, one paragraph per line.

        Returns ``""`` if the document cannot be read.
        """
        try:
            doc = Document(file_path)
            return "".join(
                paragraph.text + "\n" for paragraph in doc.paragraphs
            )
        except Exception as e:
            logger.error(f"读取Word文档失败: {e}")
            return ""

    def read_markdown_file(self, file_path: str) -> str:
        """Return the plain text of a Markdown file with markup stripped.

        The Markdown is rendered to HTML and the text is then extracted from
        that HTML.  Returns ``""`` if reading or conversion fails.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                md_content = file.read()
            html = markdown.markdown(md_content)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            logger.error(f"读取Markdown文件失败: {e}")
            return ""

    def process_file(self, file_path: str) -> List[LangchainDocument]:
        """Read one file and split it into chunks.

        Args:
            file_path: Path of the file; its extension selects the reader.

        Returns:
            The chunks produced by the text splitter, each carrying ``source``,
            ``file_type`` and ``file_name`` metadata.  An empty list is
            returned for unsupported extensions and for files whose content is
            empty or whitespace-only (including files that failed to read).
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        reader_name = self._READERS.get(file_extension)
        if reader_name is None:
            logger.warning(f"不支持的文件格式: {file_extension}")
            return []

        content = getattr(self, reader_name)(file_path)
        if not content.strip():
            logger.warning(f"文件内容为空: {file_path}")
            return []

        doc = LangchainDocument(
            page_content=content,
            metadata={
                "source": file_path,
                "file_type": file_extension,
                "file_name": os.path.basename(file_path),
            },
        )
        chunks = self.text_splitter.split_documents([doc])
        logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
        return chunks

    def process_directory(self, directory_path: str) -> List[LangchainDocument]:
        """Recursively process every supported file under ``directory_path``.

        Args:
            directory_path: Root directory to walk.

        Returns:
            All chunks from all supported files, in the order they were
            encountered by ``os.walk``.
        """
        all_chunks = []
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                file_extension = os.path.splitext(file)[1].lower()
                if file_extension not in self._READERS:
                    continue
                file_path = os.path.join(root, file)
                logger.info(f"处理文件: {file_path}")
                all_chunks.extend(self.process_file(file_path))

        logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
        return all_chunks


def qa_func(text, model_type="deepseek", max_questions=5):
    """Forward the user's text to ``call_llm_api`` and return its result.

    Args:
        text: Document content or question entered by the user.
        model_type: Model identifier passed through to ``call_llm_api``.
        max_questions: Number of QA pairs requested.  UI sliders may deliver
            this as a float, so it is truncated to an integer before the call.

    Returns:
        Whatever ``call_llm_api`` returns.
    """
    return call_llm_api(
        text,
        model_type=model_type,
        max_questions=int(max_questions),
    )


iface = gr.Interface(
    fn=qa_func,
    inputs=[
        gr.Textbox(label="请输入你的文档内容或问题"),
        # Pre-select the only choice; otherwise the handler receives None when
        # the user submits without clicking the radio button.
        gr.Radio(["gemini"], value="gemini", label="选择模型"),
        # step=1 keeps the requested count a whole number.
        gr.Slider(1, 10, value=5, step=1, label="生成问答对数量"),
    ],
    outputs="text",
    title="逢考必过·AI考试复习助手",
    description="输入文档内容,自动生成高质量问答对,仅支持 Gemini 大模型。",
)

if __name__ == "__main__":
    iface.launch()