Spaces:
Sleeping
Sleeping
File size: 4,788 Bytes
90d1485 63306e9 cf6c1df 90d1485 c30406d 63306e9 c30406d 63306e9 f20f7ac 221e227 f20f7ac 221e227 f20f7ac c30406d 63306e9 f20f7ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | import os
import PyPDF2
from docx import Document
import markdown
from bs4 import BeautifulSoup
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document as LangchainDocument
import logging
import gradio as gr
from simple_qa import call_llm_api
from dotenv import load_dotenv
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Load .txt / .pdf / .docx / .md files and split them into LangChain chunks.

    Each reader returns the file's plain-text content ("" on failure, with the
    error logged), and ``process_file`` wraps that text in a LangchainDocument
    before splitting it with RecursiveCharacterTextSplitter.
    """

    # Extensions this processor can handle, mapped in process_file().
    SUPPORTED_FORMATS = ('.txt', '.pdf', '.docx', '.md')

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Configure the splitter.

        Args:
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between adjacent chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def read_text_file(self, file_path: str) -> str:
        """Return the UTF-8 text content of a plain-text file ("" on error)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logger.error(f"读取文本文件失败: {e}")
            return ""

    def read_pdf_file(self, file_path: str) -> str:
        """Extract text from every page of a PDF ("" on error).

        Bug fix: PyPDF2's ``extract_text()`` may return None for pages with no
        extractable text; previously ``None + "\\n"`` raised a TypeError that the
        broad except swallowed, silently discarding the entire document.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against extract_text() returning None.
                pages = [(page.extract_text() or "") for page in pdf_reader.pages]
            # join is O(n), unlike repeated += concatenation.
            return "\n".join(pages) + "\n" if pages else ""
        except Exception as e:
            logger.error(f"读取PDF文件失败: {e}")
            return ""

    def read_docx_file(self, file_path: str) -> str:
        """Return the concatenated paragraph text of a Word document ("" on error)."""
        try:
            doc = Document(file_path)
            paragraphs = [paragraph.text for paragraph in doc.paragraphs]
            return "\n".join(paragraphs) + "\n" if paragraphs else ""
        except Exception as e:
            logger.error(f"读取Word文档失败: {e}")
            return ""

    def read_markdown_file(self, file_path: str) -> str:
        """Render Markdown to HTML, then strip tags to plain text ("" on error)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                md_content = file.read()
            html = markdown.markdown(md_content)
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            logger.error(f"读取Markdown文件失败: {e}")
            return ""

    def process_file(self, file_path: str) -> "List[LangchainDocument]":
        """Read one file and split it into chunks.

        Returns an empty list for unsupported formats or empty/unreadable files.
        """
        file_extension = os.path.splitext(file_path)[1].lower()
        # Dispatch table instead of an if/elif chain; also the single source
        # of truth for which formats are supported.
        readers = {
            '.txt': self.read_text_file,
            '.pdf': self.read_pdf_file,
            '.docx': self.read_docx_file,
            '.md': self.read_markdown_file,
        }
        reader = readers.get(file_extension)
        if reader is None:
            logger.warning(f"不支持的文件格式: {file_extension}")
            return []
        content = reader(file_path)
        if not content.strip():
            logger.warning(f"文件内容为空: {file_path}")
            return []
        doc = LangchainDocument(
            page_content=content,
            metadata={
                "source": file_path,
                "file_type": file_extension,
                "file_name": os.path.basename(file_path)
            }
        )
        chunks = self.text_splitter.split_documents([doc])
        logger.info(f"文件 {file_path} 被分割为 {len(chunks)} 个块")
        return chunks

    def process_directory(self, directory_path: str) -> "List[LangchainDocument]":
        """Recursively process every supported file under *directory_path*."""
        all_chunks = []
        supported_formats = ['.txt', '.pdf', '.docx', '.md']
        for root, dirs, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                file_extension = os.path.splitext(file)[1].lower()
                if file_extension in supported_formats:
                    logger.info(f"处理文件: {file_path}")
                    chunks = self.process_file(file_path)
                    all_chunks.extend(chunks)
        logger.info(f"总共处理了 {len(all_chunks)} 个文档块")
        return all_chunks
def qa_func(text, model_type="deepseek", max_questions=5):
    """Generate Q&A pairs for *text* by delegating to the shared LLM API helper.

    Args:
        text: Document content or question to generate Q&A pairs from.
        model_type: Backend model identifier forwarded to call_llm_api.
        max_questions: Upper bound on the number of Q&A pairs requested.

    Returns:
        Whatever call_llm_api produces for these arguments.
    """
    return call_llm_api(text, model_type=model_type, max_questions=max_questions)
# Gradio UI: document text in, generated Q&A pairs out.
iface = gr.Interface(
    fn=qa_func,
    inputs=[
        gr.Textbox(label="请输入你的文档内容或问题"),
        # Fix: without an explicit value= the Radio starts unselected (None),
        # and Gradio passes that None to qa_func as model_type, overriding the
        # function's own default. Pre-select the only offered choice.
        gr.Radio(["gemini"], value="gemini", label="选择模型"),
        gr.Slider(1, 10, value=5, label="生成问答对数量")
    ],
    outputs="text",
    title="逢考必过·AI考试复习助手",
    description="输入文档内容,自动生成高质量问答对,仅支持 Gemini 大模型。"
)

if __name__ == "__main__":
    iface.launch()