Spaces:

JemeinAI
/

bert_Chinese

Sleeping

App Files Files Community

JemeinAI committed on May 18, 2025

Commit

401d4ac

verified ·

1 Parent(s): 471cb28

Upload 2 files

Browse files

Files changed (2) hide show

app.py +93 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import gradio as gr
+from transformers import BertTokenizer, BertForQuestionAnswering
+import torch
+import fitz  # PyMuPDF for PDF
+import docx  # python-docx for Word files
+# Load model and tokenizer
+# Both objects are created once, at import time, and are then used as
+# module-level globals by chunk_text() and answer_question() below.
+model_name = "cgt/Roberta-wwm-ext-large-qa"
+# NOTE(review): the checkpoint name says RoBERTa but it is loaded through the
+# BERT classes; presumably the checkpoint uses the BERT architecture/vocab —
+# confirm against the model card.
+tokenizer = BertTokenizer.from_pretrained(model_name)
+model = BertForQuestionAnswering.from_pretrained(model_name)
+# Token limit for BERT (typically 512)
+# Used as the default max_length of chunk_text().
+MAX_TOKENS = 512
+# Extract text from files
+def extract_text_from_file(file):
+    if file.name.endswith(".txt"):
+        return file.read().decode("utf-8")
+    elif file.name.endswith(".pdf"):
+        text = ""
+        doc = fitz.open(stream=file.read(), filetype="pdf")
+        for page in doc:
+            text += page.get_text()
+        return text
+    elif file.name.endswith(".docx"):
+        doc = docx.Document(file)
+        return "\n".join([para.text for para in doc.paragraphs])
+    else:
+        return "❌ 不支持的文件格式"
+# Chunk large context
+def chunk_text(text, max_length=MAX_TOKENS):
+    tokens = tokenizer.tokenize(text)
+    chunks = []
+    for i in range(0, len(tokens), max_length - 50):  # leave room for question
+        chunk = tokens[i:i + max_length - 50]
+        chunks.append(tokenizer.convert_tokens_to_string(chunk))
+    return chunks
+# QA function
+def answer_question(context, question, file):
+    try:
+        if file:
+            context = extract_text_from_file(file)
+        if not context or not question:
+            return "⚠️ 请提供上下文和问题。"
+        best_answer = ""
+        best_score = -float("inf")
+        chunks = chunk_text(context)
+        for chunk in chunks:
+            inputs = tokenizer.encode_plus(question, chunk, return_tensors="pt", truncation=True)
+            input_ids = inputs["input_ids"].tolist()[0]
+            with torch.no_grad():
+                outputs = model(**inputs)
+            start_idx = torch.argmax(outputs.start_logits)
+            end_idx = torch.argmax(outputs.end_logits) + 1
+            answer = tokenizer.convert_tokens_to_string(
+                tokenizer.convert_ids_to_tokens(input_ids[start_idx:end_idx])
+            )
+            score = outputs.start_logits[0][start_idx] + outputs.end_logits[0][end_idx - 1]
+            if score > best_score and answer.strip():
+                best_answer = answer.strip()
+                best_score = score
+        return best_answer if best_answer else "🤔 没能从上下文中找到明确答案。"
+    except Exception as e:
+        return f"❌ 错误：{str(e)}"
+# Gradio Interface
+# Layout: a row holding the context textbox and the file upload, then the
+# question box, the answer box and the submit button.
+with gr.Blocks(title="中文BERT问答系统（含文档上传）") as demo:
+    gr.Markdown("## 📘 中文BERT问答系统\n支持 `.txt`、`.pdf`、`.docx` 文档上传或手动输入上下文。")
+    with gr.Row():
+        context_input = gr.Textbox(label="📝 上下文（可选）", placeholder="或上传文件", lines=6)
+        # Uploads are limited to the three extensions that
+        # extract_text_from_file() handles.
+        file_input = gr.File(label="📂 上传文档", file_types=[".txt", ".pdf", ".docx"])
+    question_input = gr.Textbox(label="❓ 问题", placeholder="请输入问题", lines=2)
+    answer_output = gr.Textbox(label="📌 答案", lines=3)
+    submit_btn = gr.Button("提交")
+    # The inputs list order must match answer_question(context, question, file).
+    submit_btn.click(fn=answer_question, inputs=[context_input, question_input, file_input], outputs=answer_output)
+# Launch the app
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+transformers
+torch
+PyMuPDF
+python-docx