Spaces:

Yaoliang
/

fengkaobiguo

Sleeping

App Files Files Community

fengkaobiguo / document_processor.py

Yaoliang

只保留gemini模型

221e227 8 months ago

raw

history blame contribute delete

4.79 kB

	import os
	import PyPDF2
	from docx import Document
	import markdown
	from bs4 import BeautifulSoup
	from typing import List, Dict, Any
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.schema import Document as LangchainDocument
	import logging
	import gradio as gr
	from simple_qa import call_llm_api
	from dotenv import load_dotenv
	# Load variables from a local .env file into the process environment
	# (python-dotenv) at import time, before any other code reads them.
	load_dotenv()

	# Configure root logging at INFO level and create this module's logger,
	# which the document readers below use to report failures and progress.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class DocumentProcessor:
	    """Read supported documents from disk and split them into LangChain chunks.

	    Supported formats are plain text (.txt), PDF (.pdf), Word (.docx) and
	    Markdown (.md). Every reader is best-effort: on failure it logs the error
	    and returns an empty string instead of raising.
	    """

	    # Single source of truth: file extension -> name of the reader method.
	    # Used by both process_file (dispatch) and process_directory (filtering)
	    # so the two can never disagree about which formats are supported.
	    _READERS = {
	        '.txt': 'read_text_file',
	        '.pdf': 'read_pdf_file',
	        '.docx': 'read_docx_file',
	        '.md': 'read_markdown_file',
	    }

	    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
	        """Create the processor and its text splitter.

	        Args:
	            chunk_size: Maximum chunk length, measured with ``len`` (characters).
	            chunk_overlap: Number of characters shared by adjacent chunks.
	        """
	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	            length_function=len,
	        )

	    def read_text_file(self, file_path: str) -> str:
	        """Return the contents of a UTF-8 text file, or "" if it cannot be read."""
	        try:
	            # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM,
	            # which would otherwise end up inside the first chunk as "\ufeff".
	            with open(file_path, 'r', encoding='utf-8-sig') as file:
	                return file.read()
	        except Exception as e:
	            logger.error("读取文本文件失败: %s", e)
	            return ""

	    def read_pdf_file(self, file_path: str) -> str:
	        """Return the text of every PDF page, one page per line group.

	        Each page's text is followed by a newline. Returns "" if the file
	        cannot be opened or parsed.
	        """
	        try:
	            with open(file_path, 'rb') as file:
	                pdf_reader = PyPDF2.PdfReader(file)
	                # A page that yields no text must not abort the whole
	                # document, so a falsy result is treated as an empty page.
	                page_texts = [
	                    (page.extract_text() or "") + "\n"
	                    for page in pdf_reader.pages
	                ]
	            return "".join(page_texts)
	        except Exception as e:
	            logger.error("读取PDF文件失败: %s", e)
	            return ""

	    def read_docx_file(self, file_path: str) -> str:
	        """Return the paragraph text of a .docx file, one paragraph per line.

	        Only ``doc.paragraphs`` is read. Returns "" if the document cannot
	        be opened.
	        """
	        try:
	            doc = Document(file_path)
	            return "".join(
	                paragraph.text + "\n" for paragraph in doc.paragraphs
	            )
	        except Exception as e:
	            logger.error("读取Word文档失败: %s", e)
	            return ""

	    def read_markdown_file(self, file_path: str) -> str:
	        """Return the plain text of a Markdown file, or "" on failure.

	        The Markdown is rendered to HTML and the markup is then stripped
	        with BeautifulSoup, leaving only the visible text.
	        """
	        try:
	            # utf-8-sig: see read_text_file; strips an optional BOM.
	            with open(file_path, 'r', encoding='utf-8-sig') as file:
	                md_content = file.read()
	            html = markdown.markdown(md_content)
	            soup = BeautifulSoup(html, 'html.parser')
	            return soup.get_text()
	        except Exception as e:
	            logger.error("读取Markdown文件失败: %s", e)
	            return ""

	    def process_file(self, file_path: str) -> List[LangchainDocument]:
	        """Read one file and split it into chunks.

	        Args:
	            file_path: Path of the file; the reader is chosen from its
	                lower-cased extension.

	        Returns:
	            The chunks produced by the text splitter, each carrying
	            ``source``, ``file_type`` and ``file_name`` metadata. An empty
	            list is returned for unsupported extensions and for files whose
	            extracted content is empty or whitespace-only.
	        """
	        file_extension = os.path.splitext(file_path)[1].lower()
	        reader_name = self._READERS.get(file_extension)
	        if reader_name is None:
	            logger.warning("不支持的文件格式: %s", file_extension)
	            return []

	        content = getattr(self, reader_name)(file_path)
	        if not content.strip():
	            logger.warning("文件内容为空: %s", file_path)
	            return []

	        doc = LangchainDocument(
	            page_content=content,
	            metadata={
	                "source": file_path,
	                "file_type": file_extension,
	                "file_name": os.path.basename(file_path),
	            },
	        )
	        chunks = self.text_splitter.split_documents([doc])
	        logger.info("文件 %s 被分割为 %s 个块", file_path, len(chunks))
	        return chunks

	    def process_directory(self, directory_path: str) -> List[LangchainDocument]:
	        """Recursively process every supported file below a directory.

	        Args:
	            directory_path: Root directory to walk.

	        Returns:
	            All chunks from all supported files, concatenated. Directories
	            and files are visited in sorted order so the result does not
	            depend on the filesystem's listing order.
	        """
	        all_chunks = []
	        for root, dirs, files in os.walk(directory_path):
	            # Sorting in place steers os.walk's top-down traversal order.
	            dirs.sort()
	            for file in sorted(files):
	                file_extension = os.path.splitext(file)[1].lower()
	                if file_extension not in self._READERS:
	                    continue
	                file_path = os.path.join(root, file)
	                logger.info("处理文件: %s", file_path)
	                all_chunks.extend(self.process_file(file_path))
	        logger.info("总共处理了 %s 个文档块", len(all_chunks))
	        return all_chunks

	def qa_func(text, model_type="deepseek", max_questions=5):
	    """Generate question/answer pairs for ``text`` via ``call_llm_api``.

	    Args:
	        text: Document content or question entered by the user.
	        model_type: Model identifier forwarded unchanged to ``call_llm_api``.
	        max_questions: Requested number of Q&A pairs. A UI slider may deliver
	            this as a float, so it is truncated to an int before forwarding.

	    Returns:
	        Whatever ``call_llm_api`` returns.
	    """
	    # NOTE(review): the default is "deepseek" while the UI below only offers
	    # "gemini" -- confirm call_llm_api still accepts this default.
	    return call_llm_api(
	        text,
	        model_type=model_type,
	        max_questions=int(max_questions),
	    )

	# Gradio UI: a text box for the document, a model selector and a slider for
	# the number of question/answer pairs, all wired to qa_func.
	iface = gr.Interface(
	    fn=qa_func,
	    inputs=[
	        gr.Textbox(label="请输入你的文档内容或问题"),
	        # Preselect the only choice so a submission never reaches qa_func
	        # with model_type=None when the user has not clicked the radio.
	        gr.Radio(["gemini"], value="gemini", label="选择模型"),
	        # step=1 keeps the slider on whole numbers; a count of Q&A pairs
	        # cannot be fractional.
	        gr.Slider(1, 10, value=5, step=1, label="生成问答对数量"),
	    ],
	    outputs="text",
	    title="逢考必过·AI考试复习助手",
	    description="输入文档内容，自动生成高质量问答对，仅支持 Gemini 大模型。",
	)

	# Start the web app only when the module is run as a script.
	if __name__ == "__main__":
	    iface.launch()