Spaces:

Hellowish
/

AnswerText

Sleeping

File size: 3,672 Bytes

40bd6e8
724cf70
2ddb6f3
 
724cf70
 
2ddb6f3
724cf70
 
2ddb6f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cac4759
 
 
 
 
 
2ddb6f3
cac4759
2ddb6f3
cac4759
 
 
2ddb6f3
cac4759
 
 
 
2ddb6f3
cac4759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ddb6f3
 
 
 
 
724cf70
2ddb6f3
 
 
 
 
 
 
 
 
724cf70
2ddb6f3
 
 
 
 
 
 
 
 
b581f25
40bd6e8
b581f25

import gradio as gr
from transformers import pipeline
import pdfplumber
import docx

# 1. Load the pretrained SQuAD v2.0 question-answering model.
# deepset/roberta-base-squad2 is used here as the standard checkpoint tuned
# for v2.0. The pipeline is built once at import time and shared by every
# request handled in this module.
qa_model = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
)

# 2. File text extraction
def extract_text(file):
    """Return the plain text contained in an uploaded PDF, DOCX, or TXT file.

    Args:
        file: The uploaded file. Either an object exposing its on-disk path
            as ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        The extracted text. An empty string is returned when ``file`` is
        ``None`` or when the extension is not ``.pdf``, ``.docx`` or ``.txt``.

    Raises:
        Whatever the underlying reader raises for a missing or unreadable
        file (for example ``OSError`` or ``UnicodeDecodeError`` for ``.txt``).
    """
    if file is None:
        return ""

    # Accept both an upload object carrying `.name` and a bare path string.
    file_path = str(getattr(file, "name", file))
    # Compare extensions case-insensitively so "REPORT.PDF" is recognised.
    lowered = file_path.lower()

    # PDF: one text chunk per page, each terminated by a newline.
    if lowered.endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may yield None for a page without extractable
            # text (e.g. a scanned image); treat that as an empty page
            # instead of crashing on `None + "\n"`.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

    # Word (.docx): one line per paragraph.
    if lowered.endswith(".docx"):
        document = docx.Document(file_path)
        return "".join(para.text + "\n" for para in document.paragraphs)

    # Plain text (.txt): read the whole file as UTF-8.
    if lowered.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()

    # Unsupported extension: nothing to extract.
    return ""

# 3. Main prediction logic
def predict(file, manual_context, question):
    """Answer ``question`` from an uploaded file or from pasted text.

    An uploaded file takes priority over ``manual_context``. The result is
    always a human-readable string meant for the output textbox: either the
    answer with its confidence score, or a warning/error message.

    Args:
        file: Uploaded file (object with ``.name`` or a path string), or
            ``None`` when nothing was uploaded.
        manual_context: Text pasted by the user; ``None`` is treated as empty.
        question: The question to answer; ``None`` is treated as empty.

    Returns:
        A message string. No exception is propagated for extraction or
        inference failures; they are reported in the returned text.
    """
    import os  # local import keeps this block self-contained

    # 1. Reject an empty question (None-safe: a cleared field may be None).
    if not (question or "").strip():
        return "⚠️ 請輸入您想提問的問題。"

    manual_context = manual_context or ""

    # 2. Pick the data source: an uploaded file wins over pasted text.
    if file is not None:
        try:
            context = extract_text(file)
        except Exception as e:
            # A corrupt or unreadable upload is reported like any other
            # runtime failure instead of surfacing as an unhandled error.
            return f"❌ 發生錯誤：{str(e)}"
        # Works for an object with `.name` as well as for a bare path string.
        file_name = os.path.basename(str(getattr(file, "name", file)))
        source_info = f"📝 來源：已偵測到上傳檔案 ({file_name})"
    elif manual_context.strip():
        context = manual_context
        source_info = "📝 來源：手動輸入的文本"
    else:
        # Neither a file nor pasted text was provided.
        return "⚠️ 請先提供文件內容（上傳檔案或是在文字框貼上內容）。"

    # 3. Make sure some text was actually obtained from the source.
    if not (context or "").strip():
        return "⚠️ 無法從提供的來源中提取有效文字，請檢查檔案格式。"

    # 4. Run model inference.
    try:
        result = qa_model(question=question, context=context)
    except Exception as e:
        return f"❌ 發生錯誤：{str(e)}"

    # A score below this threshold is treated as "no answer in the context".
    if result['score'] < 0.01:
        return f"{source_info}\n\n❌ 抱歉，在提供的內容中找不到相關答案。"

    return (f"{source_info}\n"
            f"🎯 模型回答：{result['answer']}\n"
            f"📊 信心分數：{round(result['score'] * 100, 2)}%")

# 4. Build the Gradio web interface.
# Components are created inside nested Blocks/Row/Column contexts, so the
# order of the statements below defines the page layout.
with gr.Blocks(title="Case Study: AI Document QA") as demo:
    # Page heading and one-line description.
    gr.Markdown("# 📑 Case Study: 智慧文件問答系統")
    gr.Markdown("利用語言模型進行文件自動化讀取與問答。")
    
    with gr.Row():
        # First column: the inputs (file upload, pasted text, question)
        # and the submit button.
        with gr.Column():
            file_input = gr.File(label="1. 上傳文件 (PDF, Word, TXT)")
            text_input = gr.Textbox(lines=8, label="或是在此貼上文件內容", placeholder="若已上傳文件則無需填寫此處...")
            question_input = gr.Textbox(lines=2, label="2. 輸入您的問題", placeholder="例如：這份文件的主要結論是什麼？")
            submit_btn = gr.Button("開始分析", variant="primary")
            
        # Second column: the textbox that shows the string returned by predict.
        with gr.Column():
            answer_output = gr.Textbox(label="模型回答結果", lines=10)

    # Wire the button: the inputs are passed to predict in the same order as
    # its parameters (file, manual_context, question).
    submit_btn.click(
        fn=predict, 
        inputs=[file_input, text_input, question_input], 
        outputs=answer_output
    )
    
    # Footer: separator line followed by a hint about SQuAD v2.0 models.
    gr.Markdown("---")
    gr.Markdown("💡 **提示：** 針對 SQuAD v2.0 資料集訓練的模型具備判斷『問題是否可回答』的能力。")

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()