Yaoliang committed on
Commit
f20f7ac
·
verified ·
1 Parent(s): 2f01eb2

Update document_processor.py

Browse files
Files changed (1) hide show
  1. document_processor.py +11 -31
document_processor.py CHANGED
@@ -26,7 +26,6 @@ class DocumentProcessor:
26
  )
27
 
28
  def read_text_file(self, file_path: str) -> str:
29
- """读取文本文件"""
30
  try:
31
  with open(file_path, 'r', encoding='utf-8') as file:
32
  return file.read()
@@ -35,7 +34,6 @@ class DocumentProcessor:
35
  return ""
36
 
37
  def read_pdf_file(self, file_path: str) -> str:
38
- """读取PDF文件"""
39
  try:
40
  text = ""
41
  with open(file_path, 'rb') as file:
@@ -48,7 +46,6 @@ class DocumentProcessor:
48
  return ""
49
 
50
  def read_docx_file(self, file_path: str) -> str:
51
- """读取Word文档"""
52
  try:
53
  doc = Document(file_path)
54
  text = ""
@@ -60,11 +57,9 @@ class DocumentProcessor:
60
  return ""
61
 
62
  def read_markdown_file(self, file_path: str) -> str:
63
- """读取Markdown文件"""
64
  try:
65
  with open(file_path, 'r', encoding='utf-8') as file:
66
  md_content = file.read()
67
- # 转换为纯文本
68
  html = markdown.markdown(md_content)
69
  soup = BeautifulSoup(html, 'html.parser')
70
  return soup.get_text()
@@ -73,10 +68,7 @@ class DocumentProcessor:
73
  return ""
74
 
75
  def process_file(self, file_path: str) -> List[LangchainDocument]:
76
- """处理单个文件并返回文档块"""
77
  file_extension = os.path.splitext(file_path)[1].lower()
78
-
79
- # 根据文件类型选择读取方法
80
  if file_extension == '.txt':
81
  content = self.read_text_file(file_path)
82
  elif file_extension == '.pdf':
@@ -88,12 +80,9 @@ class DocumentProcessor:
88
  else:
89
  logger.warning(f"不支持的文件格式: {file_extension}")
90
  return []
91
-
92
  if not content.strip():
93
  logger.warning(f"文件内容为空: {file_path}")
94
  return []
95
-
96
- # 创建Langchain文档对象
97
  doc = LangchainDocument(
98
  page_content=content,
99
  metadata={
@@ -102,28 +91,21 @@ class DocumentProcessor:
102
  "file_name": os.path.basename(file_path)
103
  }
104
  )
105
-
106
- # 分割文档
107
  chunks = self.text_splitter.split_documents([doc])
108
  logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
109
-
110
  return chunks
111
 
112
  def process_directory(self, directory_path: str) -> List[LangchainDocument]:
113
- """处理目录中的所有支持的文件"""
114
  all_chunks = []
115
  supported_formats = ['.txt', '.pdf', '.docx', '.md']
116
-
117
  for root, dirs, files in os.walk(directory_path):
118
  for file in files:
119
  file_path = os.path.join(root, file)
120
  file_extension = os.path.splitext(file)[1].lower()
121
-
122
  if file_extension in supported_formats:
123
  logger.info(f"处理文件: {file_path}")
124
  chunks = self.process_file(file_path)
125
  all_chunks.extend(chunks)
126
-
127
  logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
128
  return all_chunks
129
 
@@ -132,18 +114,16 @@ def qa_func(text, model_type="deepseek", max_questions=5):
132
  return result
133
 
134
  iface = gr.Interface(
135
- fn=qa_func,
136
- inputs=[
137
- gr.Textbox(label="请输入你的文档内容或问题"),
138
- gr.Radio(["deepseek", "stepfun", "gemini"], label="选择模型"),
139
- gr.Slider(1, 10, value=5, label="生成问答对数量")
140
- ],
141
- outputs="text",
142
- title="逢考必过·AI考试复习助手",
143
- description="输入文档内容,自动生成高质量问答对,支持多模型切换。"
144
- )
145
-
146
-
147
 
148
  if __name__ == "__main__":
149
- iface.launch()
 
26
  )
27
 
28
  def read_text_file(self, file_path: str) -> str:
 
29
  try:
30
  with open(file_path, 'r', encoding='utf-8') as file:
31
  return file.read()
 
34
  return ""
35
 
36
  def read_pdf_file(self, file_path: str) -> str:
 
37
  try:
38
  text = ""
39
  with open(file_path, 'rb') as file:
 
46
  return ""
47
 
48
  def read_docx_file(self, file_path: str) -> str:
 
49
  try:
50
  doc = Document(file_path)
51
  text = ""
 
57
  return ""
58
 
59
  def read_markdown_file(self, file_path: str) -> str:
 
60
  try:
61
  with open(file_path, 'r', encoding='utf-8') as file:
62
  md_content = file.read()
 
63
  html = markdown.markdown(md_content)
64
  soup = BeautifulSoup(html, 'html.parser')
65
  return soup.get_text()
 
68
  return ""
69
 
70
  def process_file(self, file_path: str) -> List[LangchainDocument]:
 
71
  file_extension = os.path.splitext(file_path)[1].lower()
 
 
72
  if file_extension == '.txt':
73
  content = self.read_text_file(file_path)
74
  elif file_extension == '.pdf':
 
80
  else:
81
  logger.warning(f"不支持的文件格式: {file_extension}")
82
  return []
 
83
  if not content.strip():
84
  logger.warning(f"文件内容为空: {file_path}")
85
  return []
 
 
86
  doc = LangchainDocument(
87
  page_content=content,
88
  metadata={
 
91
  "file_name": os.path.basename(file_path)
92
  }
93
  )
 
 
94
  chunks = self.text_splitter.split_documents([doc])
95
  logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
 
96
  return chunks
97
 
98
  def process_directory(self, directory_path: str) -> List[LangchainDocument]:
 
99
  all_chunks = []
100
  supported_formats = ['.txt', '.pdf', '.docx', '.md']
 
101
  for root, dirs, files in os.walk(directory_path):
102
  for file in files:
103
  file_path = os.path.join(root, file)
104
  file_extension = os.path.splitext(file)[1].lower()
 
105
  if file_extension in supported_formats:
106
  logger.info(f"处理文件: {file_path}")
107
  chunks = self.process_file(file_path)
108
  all_chunks.extend(chunks)
 
109
  logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
110
  return all_chunks
111
 
 
114
  return result
115
 
116
  iface = gr.Interface(
117
+ fn=qa_func,
118
+ inputs=[
119
+ gr.Textbox(label="请输入你的文档内容或问题"),
120
+ gr.Radio(["deepseek", "stepfun", "gemini"], label="选择模型"),
121
+ gr.Slider(1, 10, value=5, label="生成问答对数量")
122
+ ],
123
+ outputs="text",
124
+ title="逢考必过·AI考试复习助手",
125
+ description="输入文档内容,自动生成高质量问答对,支持多模型切换。"
126
+ )
 
 
127
 
128
  if __name__ == "__main__":
129
+ iface.launch()