Spaces:

Yaoliang
/

fengkaobiguo

Sleeping

App Files Files Community

Yaoliang committed on Jul 8, 2025

Commit

f20f7ac

verified ·

1 Parent(s): 2f01eb2

Update document_processor.py

Browse files

Files changed (1) hide show

document_processor.py +11 -31

document_processor.py CHANGED Viewed

@@ -26,7 +26,6 @@ class DocumentProcessor:
         )
     def read_text_file(self, file_path: str) -> str:
-        """读取文本文件"""
         try:
             with open(file_path, 'r', encoding='utf-8') as file:
                 return file.read()
@@ -35,7 +34,6 @@ class DocumentProcessor:
             return ""
     def read_pdf_file(self, file_path: str) -> str:
-        """读取PDF文件"""
         try:
             text = ""
             with open(file_path, 'rb') as file:
@@ -48,7 +46,6 @@ class DocumentProcessor:
             return ""
     def read_docx_file(self, file_path: str) -> str:
-        """读取Word文档"""
         try:
             doc = Document(file_path)
             text = ""
@@ -60,11 +57,9 @@ class DocumentProcessor:
             return ""
     def read_markdown_file(self, file_path: str) -> str:
-        """读取Markdown文件"""
         try:
             with open(file_path, 'r', encoding='utf-8') as file:
                 md_content = file.read()
-                # 转换为纯文本
                 html = markdown.markdown(md_content)
                 soup = BeautifulSoup(html, 'html.parser')
                 return soup.get_text()
@@ -73,10 +68,7 @@ class DocumentProcessor:
             return ""
     def process_file(self, file_path: str) -> List[LangchainDocument]:
-        """处理单个文件并返回文档块"""
         file_extension = os.path.splitext(file_path)[1].lower()
-        # 根据文件类型选择读取方法
         if file_extension == '.txt':
             content = self.read_text_file(file_path)
         elif file_extension == '.pdf':
@@ -88,12 +80,9 @@ class DocumentProcessor:
         else:
             logger.warning(f"不支持的文件格式: {file_extension}")
             return []
         if not content.strip():
             logger.warning(f"文件内容为空: {file_path}")
             return []
-        # 创建Langchain文档对象
         doc = LangchainDocument(
             page_content=content,
             metadata={
@@ -102,28 +91,21 @@ class DocumentProcessor:
                 "file_name": os.path.basename(file_path)
             }
         )
-        # 分割文档
         chunks = self.text_splitter.split_documents([doc])
         logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
         return chunks
     def process_directory(self, directory_path: str) -> List[LangchainDocument]:
-        """处理目录中的所有支持的文件"""
         all_chunks = []
         supported_formats = ['.txt', '.pdf', '.docx', '.md']
         for root, dirs, files in os.walk(directory_path):
             for file in files:
                 file_path = os.path.join(root, file)
                 file_extension = os.path.splitext(file)[1].lower()
                 if file_extension in supported_formats:
                     logger.info(f"处理文件: {file_path}")
                     chunks = self.process_file(file_path)
                     all_chunks.extend(chunks)
         logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
         return all_chunks
@@ -132,18 +114,16 @@ def qa_func(text, model_type="deepseek", max_questions=5):
     return result
 iface = gr.Interface(
-       fn=qa_func,
-       inputs=[
-           gr.Textbox(label="请输入你的文档内容或问题"),
-           gr.Radio(["deepseek", "stepfun", "gemini"], label="选择模型"),
-           gr.Slider(1, 10, value=5, label="生成问答对数量")
-       ],
-       outputs="text",
-       title="逢考必过·AI考试复习助手",
-       description="输入文档内容，自动生成高质量问答对，支持多模型切换。"
- )
 if __name__ == "__main__":
-    iface.launch()

         )
     def read_text_file(self, file_path: str) -> str:
         try:
             with open(file_path, 'r', encoding='utf-8') as file:
                 return file.read()
             return ""
     def read_pdf_file(self, file_path: str) -> str:
         try:
             text = ""
             with open(file_path, 'rb') as file:
             return ""
     def read_docx_file(self, file_path: str) -> str:
         try:
             doc = Document(file_path)
             text = ""
             return ""
     def read_markdown_file(self, file_path: str) -> str:
         try:
             with open(file_path, 'r', encoding='utf-8') as file:
                 md_content = file.read()
                 html = markdown.markdown(md_content)
                 soup = BeautifulSoup(html, 'html.parser')
                 return soup.get_text()
             return ""
     def process_file(self, file_path: str) -> List[LangchainDocument]:
         file_extension = os.path.splitext(file_path)[1].lower()
         if file_extension == '.txt':
             content = self.read_text_file(file_path)
         elif file_extension == '.pdf':
         else:
             logger.warning(f"不支持的文件格式: {file_extension}")
             return []
         if not content.strip():
             logger.warning(f"文件内容为空: {file_path}")
             return []
         doc = LangchainDocument(
             page_content=content,
             metadata={
                 "file_name": os.path.basename(file_path)
             }
         )
         chunks = self.text_splitter.split_documents([doc])
         logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
         return chunks
     def process_directory(self, directory_path: str) -> List[LangchainDocument]:
         """Collect the chunks of every supported file under a directory tree.

         The tree rooted at ``directory_path`` is traversed with ``os.walk``.
         A file is handed to ``self.process_file`` only when its lower-cased
         extension is one of ``.txt``, ``.pdf``, ``.docx`` or ``.md``; every
         other file is ignored.

         Args:
             directory_path: Root directory to traverse.

         Returns:
             A single list holding the chunks returned by ``process_file``
             for each matching file, in traversal order.
         """
         supported_formats = ['.txt', '.pdf', '.docx', '.md']
         collected = []
         for current_dir, _subdirs, filenames in os.walk(directory_path):
             for filename in filenames:
                 extension = os.path.splitext(filename)[1].lower()
                 # Guard clause: skip anything that is not a supported format.
                 if extension not in supported_formats:
                     continue
                 full_path = os.path.join(current_dir, filename)
                 logger.info(f"处理文件: {full_path}")
                 collected.extend(self.process_file(full_path))
         logger.info(f"总共处理了 {len(collected)} 个文档块")
         return collected
     return result
 iface = gr.Interface(
+    fn=qa_func,
+    inputs=[
+        gr.Textbox(label="请输入你的文档内容或问题"),
+        gr.Radio(["deepseek", "stepfun", "gemini"], label="选择模型"),
+        gr.Slider(1, 10, value=5, label="生成问答对数量")
+    ],
+    outputs="text",
+    title="逢考必过·AI考试复习助手",
+    description="输入文档内容，自动生成高质量问答对，支持多模型切换。"
+)
 if __name__ == "__main__":
+    iface.launch()