Spaces:
Sleeping
Sleeping
File size: 4,788 Bytes
90d1485 63306e9 cf6c1df 90d1485 c30406d 63306e9 c30406d 63306e9 f20f7ac 221e227 f20f7ac 221e227 f20f7ac c30406d 63306e9 f20f7ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | import os
import PyPDF2
from docx import Document
import markdown
from bs4 import BeautifulSoup
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument
import logging
import gradio as gr
from simple_qa import call_llm_api
from dotenv import load_dotenv
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Load .txt / .pdf / .docx / .md files and split them into LangChain chunks.

    Each reader returns the file's plain-text content ("" on failure, with the
    error logged), and ``process_file`` wraps that text in a LangchainDocument
    before splitting it with RecursiveCharacterTextSplitter.
    """

    # Extensions this processor can handle, mapped in process_file().
    SUPPORTED_FORMATS = ('.txt', '.pdf', '.docx', '.md')

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Configure the splitter.

        Args:
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between adjacent chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def read_text_file(self, file_path: str) -> str:
        """Return the UTF-8 text content of a plain-text file ("" on error)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logger.error(f"读取文本文件失败: {e}")
            return ""

    def read_pdf_file(self, file_path: str) -> str:
        """Extract text from every page of a PDF ("" on error).

        Bug fix: PyPDF2's ``extract_text()`` may return None for pages with no
        extractable text; previously ``None + "\\n"`` raised a TypeError that the
        broad except swallowed, silently discarding the entire document.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against extract_text() returning None.
                pages = [(page.extract_text() or "") for page in pdf_reader.pages]
            # join is O(n), unlike repeated += concatenation.
            return "\n".join(pages) + "\n" if pages else ""
        except Exception as e:
            logger.error(f"读取PDF文件失败: {e}")
            return ""

    def read_docx_file(self, file_path: str) -> str:
        """Return the concatenated paragraph text of a Word document ("" on error)."""
        try:
            doc = Document(file_path)
            paragraphs = [paragraph.text for paragraph in doc.paragraphs]
            return "\n".join(paragraphs) + "\n" if paragraphs else ""
        except Exception as e:
            logger.error(f"读取Word文档失败: {e}")
            return ""

    def read_markdown_file(self, file_path: str) -> str:
        """Render Markdown to HTML, then strip tags to plain text ("" on error)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                md_content = file.read()
            html = markdown.markdown(md_content)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            logger.error(f"读取Markdown文件失败: {e}")
            return ""

    def process_file(self, file_path: str) -> "List[LangchainDocument]":
        """Read one file and split it into chunks.

        Returns an empty list for unsupported formats or empty/unreadable files.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        # Dispatch table instead of an if/elif chain; also the single source
        # of truth for which formats are supported.
        readers = {
            '.txt': self.read_text_file,
            '.pdf': self.read_pdf_file,
            '.docx': self.read_docx_file,
            '.md': self.read_markdown_file,
        }
        reader = readers.get(file_extension)
        if reader is None:
            logger.warning(f"不支持的文件格式: {file_extension}")
            return []
        content = reader(file_path)
        if not content.strip():
            logger.warning(f"文件内容为空: {file_path}")
            return []
        doc = LangchainDocument(
            page_content=content,
            metadata={
                "source": file_path,
                "file_type": file_extension,
                "file_name": os.path.basename(file_path)
            }
        )
        chunks = self.text_splitter.split_documents([doc])
        logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
        return chunks

    def process_directory(self, directory_path: str) -> "List[LangchainDocument]":
        """Recursively process every supported file under *directory_path*."""
        all_chunks = []
        supported_formats = ['.txt', '.pdf', '.docx', '.md']
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                file_extension = os.path.splitext(file)[1].lower()
                if file_extension in supported_formats:
                    logger.info(f"处理文件: {file_path}")
                    chunks = self.process_file(file_path)
                    all_chunks.extend(chunks)
        logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
        return all_chunks
def qa_func(text, model_type="deepseek", max_questions=5):
    """Generate Q&A pairs for *text* by delegating to the shared LLM API helper.

    Args:
        text: Document content or question to generate Q&A pairs from.
        model_type: Backend model identifier forwarded to call_llm_api.
        max_questions: Upper bound on the number of Q&A pairs requested.

    Returns:
        Whatever call_llm_api produces for these arguments.
    """
    return call_llm_api(text, model_type=model_type, max_questions=max_questions)
# Gradio UI: document text in, generated Q&A pairs out.
iface = gr.Interface(
    fn=qa_func,
    inputs=[
        gr.Textbox(label="请输入你的文档内容或问题"),
        # Fix: without an explicit value= the Radio starts unselected (None),
        # and Gradio passes that None to qa_func as model_type, overriding the
        # function's own default. Pre-select the only offered choice.
        gr.Radio(["gemini"], value="gemini", label="选择模型"),
        gr.Slider(1, 10, value=5, label="生成问答对数量")
    ],
    outputs="text",
    title="逢考必过·AI考试复习助手",
    description="输入文档内容,自动生成高质量问答对,仅支持 Gemini 大模型。"
)

if __name__ == "__main__":
    iface.launch()