File size: 4,788 Bytes
90d1485
 
 
 
 
 
 
 
 
63306e9
 
cf6c1df
 
90d1485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c30406d
 
63306e9
 
 
c30406d
63306e9
f20f7ac
 
 
221e227
f20f7ac
 
 
 
221e227
f20f7ac
c30406d
63306e9
f20f7ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import PyPDF2
from docx import Document
import markdown
from bs4 import BeautifulSoup
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument
import logging
import gradio as gr
from simple_qa import call_llm_api
from dotenv import load_dotenv
load_dotenv()

# Configure root logging once at import time; all classes/functions in this
# module log through this module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DocumentProcessor:
    """Load .txt/.pdf/.docx/.md files and split their text into overlapping
    chunks ready for a LangChain retrieval pipeline.

    Every reader returns the file's plain text, or "" on failure (errors are
    logged, never raised), so callers can treat all file types uniformly.
    """

    # Single source of truth for handled extensions; process_file dispatches
    # on these and process_directory filters on the same tuple.
    SUPPORTED_FORMATS = ('.txt', '.pdf', '.docx', '.md')

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """chunk_size: max characters per chunk; chunk_overlap: characters
        repeated between adjacent chunks to preserve context across splits."""
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def read_text_file(self, file_path: str) -> str:
        """Return the contents of a UTF-8 text file, or "" on any error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logger.error(f"读取文本文件失败: {e}")
            return ""

    def read_pdf_file(self, file_path: str) -> str:
        """Extract the text of every PDF page, one trailing newline per page.

        extract_text() can yield None/empty for pages without a text layer
        (e.g. scanned images) — coalesce to "" instead of crashing on the
        concatenation. A single join also avoids quadratic += on large PDFs.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            logger.error(f"读取PDF文件失败: {e}")
            return ""

    def read_docx_file(self, file_path: str) -> str:
        """Concatenate all paragraph texts of a Word document, one per line."""
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"读取Word文档失败: {e}")
            return ""

    def read_markdown_file(self, file_path: str) -> str:
        """Render Markdown to HTML, then strip tags to recover plain text."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                md_content = file.read()
            html = markdown.markdown(md_content)
            return BeautifulSoup(html, 'html.parser').get_text()
        except Exception as e:
            logger.error(f"读取Markdown文件失败: {e}")
            return ""

    def process_file(self, file_path: str) -> "List[LangchainDocument]":
        """Read one file and split it into chunk documents.

        Returns [] for unsupported extensions, unreadable files, or files
        whose extracted text is empty/whitespace-only.
        """
        # Dispatch table keeps the extension → reader mapping in one place.
        readers = {
            '.txt': self.read_text_file,
            '.pdf': self.read_pdf_file,
            '.docx': self.read_docx_file,
            '.md': self.read_markdown_file,
        }
        file_extension = os.path.splitext(file_path)[1].lower()
        reader = readers.get(file_extension)
        if reader is None:
            logger.warning(f"不支持的文件格式: {file_extension}")
            return []
        content = reader(file_path)
        if not content.strip():
            logger.warning(f"文件内容为空: {file_path}")
            return []
        doc = LangchainDocument(
            page_content=content,
            metadata={
                "source": file_path,
                "file_type": file_extension,
                "file_name": os.path.basename(file_path)
            }
        )
        chunks = self.text_splitter.split_documents([doc])
        logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
        return chunks

    def process_directory(self, directory_path: str) -> "List[LangchainDocument]":
        """Recursively process every supported file under directory_path and
        return the concatenated chunk list."""
        all_chunks: "List[LangchainDocument]" = []
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                if os.path.splitext(file)[1].lower() in self.SUPPORTED_FORMATS:
                    file_path = os.path.join(root, file)
                    logger.info(f"处理文件: {file_path}")
                    all_chunks.extend(self.process_file(file_path))
        logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
        return all_chunks

def qa_func(text, model_type="deepseek", max_questions=5):
    """Generate Q&A pairs for *text* via the configured LLM backend.

    Args:
        text: Document content or question supplied by the user.
        model_type: Backend identifier forwarded to call_llm_api. A Gradio
            Radio with no preset value submits None; fall back to the
            signature default rather than forwarding None to the API.
        max_questions: Upper bound on generated Q&A pairs. Gradio sliders can
            deliver floats, so coerce to int before forwarding.

    Returns:
        Whatever call_llm_api returns (rendered as text by the UI).
    """
    if model_type is None:
        model_type = "deepseek"
    return call_llm_api(text, model_type=model_type, max_questions=int(max_questions))

# Gradio UI wiring: one text box, a model selector, and a Q&A-count slider.
iface = gr.Interface(
    fn=qa_func,
    inputs=[
        gr.Textbox(label="请输入你的文档内容或问题"),
        # value= gives the radio a preset selection; without it Gradio
        # submits None for an untouched radio, which would be forwarded
        # to qa_func as the model name.
        gr.Radio(["gemini"], value="gemini", label="选择模型"),
        # step=1 keeps the question count an integer.
        gr.Slider(1, 10, value=5, step=1, label="生成问答对数量")
    ],
    outputs="text",
    title="逢考必过·AI考试复习助手",
    description="输入文档内容,自动生成高质量问答对,仅支持 Gemini 大模型。"
)

if __name__ == "__main__":
    # Launch the Gradio web server (blocks until the process is stopped).
    iface.launch()