Spaces:
Sleeping
Sleeping
Update document_processor.py
Browse files - document_processor.py (+11 -31)
document_processor.py
CHANGED
|
@@ -26,7 +26,6 @@ class DocumentProcessor:
|
|
| 26 |
)
|
| 27 |
|
| 28 |
def read_text_file(self, file_path: str) -> str:
|
| 29 |
-
"""读取文本文件"""
|
| 30 |
try:
|
| 31 |
with open(file_path, 'r', encoding='utf-8') as file:
|
| 32 |
return file.read()
|
|
@@ -35,7 +34,6 @@ class DocumentProcessor:
|
|
| 35 |
return ""
|
| 36 |
|
| 37 |
def read_pdf_file(self, file_path: str) -> str:
|
| 38 |
-
"""读取PDF文件"""
|
| 39 |
try:
|
| 40 |
text = ""
|
| 41 |
with open(file_path, 'rb') as file:
|
|
@@ -48,7 +46,6 @@ class DocumentProcessor:
|
|
| 48 |
return ""
|
| 49 |
|
| 50 |
def read_docx_file(self, file_path: str) -> str:
|
| 51 |
-
"""读取Word文档"""
|
| 52 |
try:
|
| 53 |
doc = Document(file_path)
|
| 54 |
text = ""
|
|
@@ -60,11 +57,9 @@ class DocumentProcessor:
|
|
| 60 |
return ""
|
| 61 |
|
| 62 |
def read_markdown_file(self, file_path: str) -> str:
|
| 63 |
-
"""读取Markdown文件"""
|
| 64 |
try:
|
| 65 |
with open(file_path, 'r', encoding='utf-8') as file:
|
| 66 |
md_content = file.read()
|
| 67 |
-
# 转换为纯文本
|
| 68 |
html = markdown.markdown(md_content)
|
| 69 |
soup = BeautifulSoup(html, 'html.parser')
|
| 70 |
return soup.get_text()
|
|
@@ -73,10 +68,7 @@ class DocumentProcessor:
|
|
| 73 |
return ""
|
| 74 |
|
| 75 |
def process_file(self, file_path: str) -> List[LangchainDocument]:
|
| 76 |
-
"""处理单个文件并返回文档块"""
|
| 77 |
file_extension = os.path.splitext(file_path)[1].lower()
|
| 78 |
-
|
| 79 |
-
# 根据文件类型选择读取方法
|
| 80 |
if file_extension == '.txt':
|
| 81 |
content = self.read_text_file(file_path)
|
| 82 |
elif file_extension == '.pdf':
|
|
@@ -88,12 +80,9 @@ class DocumentProcessor:
|
|
| 88 |
else:
|
| 89 |
logger.warning(f"不支持的文件格式: {file_extension}")
|
| 90 |
return []
|
| 91 |
-
|
| 92 |
if not content.strip():
|
| 93 |
logger.warning(f"文件内容为空: {file_path}")
|
| 94 |
return []
|
| 95 |
-
|
| 96 |
-
# 创建Langchain文档对象
|
| 97 |
doc = LangchainDocument(
|
| 98 |
page_content=content,
|
| 99 |
metadata={
|
|
@@ -102,28 +91,21 @@ class DocumentProcessor:
|
|
| 102 |
"file_name": os.path.basename(file_path)
|
| 103 |
}
|
| 104 |
)
|
| 105 |
-
|
| 106 |
-
# 分割文档
|
| 107 |
chunks = self.text_splitter.split_documents([doc])
|
| 108 |
logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
|
| 109 |
-
|
| 110 |
return chunks
|
| 111 |
|
| 112 |
def process_directory(self, directory_path: str) -> List[LangchainDocument]:
    """Recursively collect document chunks from every supported file.

    Walks ``directory_path`` with ``os.walk``, keeps the files whose
    lower-cased extension is .txt, .pdf, .docx or .md, hands each of them to
    ``self.process_file`` and concatenates the returned chunks in walk order.

    Args:
        directory_path: Root directory to scan.

    Returns:
        A flat list holding the chunks of all processed files; empty when no
        supported file is found.
    """
    recognised_suffixes = ('.txt', '.pdf', '.docx', '.md')
    all_chunks = []

    for current_dir, _subdirs, filenames in os.walk(directory_path):
        for filename in filenames:
            suffix = os.path.splitext(filename)[1].lower()
            if suffix not in recognised_suffixes:
                continue  # unsupported type: skipped without logging

            file_path = os.path.join(current_dir, filename)
            logger.info(f"处理文件: {file_path}")
            all_chunks += self.process_file(file_path)

    logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
    return all_chunks
|
| 129 |
|
|
@@ -132,18 +114,16 @@ def qa_func(text, model_type="deepseek", max_questions=5):
|
|
| 132 |
return result
|
| 133 |
|
| 134 |
iface = gr.Interface(
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
|
| 148 |
if __name__ == "__main__":
|
| 149 |
-
iface.launch()
|
|
|
|
| 26 |
)
|
| 27 |
|
| 28 |
def read_text_file(self, file_path: str) -> str:
|
|
|
|
| 29 |
try:
|
| 30 |
with open(file_path, 'r', encoding='utf-8') as file:
|
| 31 |
return file.read()
|
|
|
|
| 34 |
return ""
|
| 35 |
|
| 36 |
def read_pdf_file(self, file_path: str) -> str:
|
|
|
|
| 37 |
try:
|
| 38 |
text = ""
|
| 39 |
with open(file_path, 'rb') as file:
|
|
|
|
| 46 |
return ""
|
| 47 |
|
| 48 |
def read_docx_file(self, file_path: str) -> str:
|
|
|
|
| 49 |
try:
|
| 50 |
doc = Document(file_path)
|
| 51 |
text = ""
|
|
|
|
| 57 |
return ""
|
| 58 |
|
| 59 |
def read_markdown_file(self, file_path: str) -> str:
|
|
|
|
| 60 |
try:
|
| 61 |
with open(file_path, 'r', encoding='utf-8') as file:
|
| 62 |
md_content = file.read()
|
|
|
|
| 63 |
html = markdown.markdown(md_content)
|
| 64 |
soup = BeautifulSoup(html, 'html.parser')
|
| 65 |
return soup.get_text()
|
|
|
|
| 68 |
return ""
|
| 69 |
|
| 70 |
def process_file(self, file_path: str) -> List[LangchainDocument]:
|
|
|
|
| 71 |
file_extension = os.path.splitext(file_path)[1].lower()
|
|
|
|
|
|
|
| 72 |
if file_extension == '.txt':
|
| 73 |
content = self.read_text_file(file_path)
|
| 74 |
elif file_extension == '.pdf':
|
|
|
|
| 80 |
else:
|
| 81 |
logger.warning(f"不支持的文件格式: {file_extension}")
|
| 82 |
return []
|
|
|
|
| 83 |
if not content.strip():
|
| 84 |
logger.warning(f"文件内容为空: {file_path}")
|
| 85 |
return []
|
|
|
|
|
|
|
| 86 |
doc = LangchainDocument(
|
| 87 |
page_content=content,
|
| 88 |
metadata={
|
|
|
|
| 91 |
"file_name": os.path.basename(file_path)
|
| 92 |
}
|
| 93 |
)
|
|
|
|
|
|
|
| 94 |
chunks = self.text_splitter.split_documents([doc])
|
| 95 |
logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
|
|
|
|
| 96 |
return chunks
|
| 97 |
|
| 98 |
def process_directory(self, directory_path: str) -> List[LangchainDocument]:
    """Recursively collect document chunks from every supported file.

    Walks ``directory_path`` with ``os.walk``, keeps the files whose
    lower-cased extension is .txt, .pdf, .docx or .md, hands each of them to
    ``self.process_file`` and concatenates the returned chunks in walk order.

    Args:
        directory_path: Root directory to scan.

    Returns:
        A flat list holding the chunks of all processed files; empty when no
        supported file is found.
    """
    recognised_suffixes = ('.txt', '.pdf', '.docx', '.md')
    all_chunks = []

    for current_dir, _subdirs, filenames in os.walk(directory_path):
        for filename in filenames:
            suffix = os.path.splitext(filename)[1].lower()
            if suffix not in recognised_suffixes:
                continue  # unsupported type: skipped without logging

            file_path = os.path.join(current_dir, filename)
            logger.info(f"处理文件: {file_path}")
            all_chunks += self.process_file(file_path)

    logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
    return all_chunks
|
| 111 |
|
|
|
|
| 114 |
return result
|
| 115 |
|
| 116 |
# Gradio front end for qa_func. The three input widgets are passed
# positionally, matching qa_func(text, model_type, max_questions).
iface = gr.Interface(
    fn=qa_func,
    inputs=[
        gr.Textbox(label="请输入你的文档内容或问题"),
        gr.Radio(["deepseek", "stepfun", "gemini"], label="选择模型"),
        # step=1 keeps the requested number of Q&A pairs a whole number;
        # without an explicit step Gradio derives a fractional increment
        # from the range, so the slider could emit values such as 4.37.
        gr.Slider(1, 10, value=5, step=1, label="生成问答对数量"),
    ],
    outputs="text",
    title="逢考必过·AI考试复习助手",
    description="输入文档内容,自动生成高质量问答对,支持多模型切换。",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|