Spaces:
Sleeping
Sleeping
Commit ·
cfcbf08
1
Parent(s): 55ace82
that is enough for now
Browse files- app_gradio.py +52 -39
- core/__init__.py +0 -0
- core/auto_qa_hf.py +40 -31
- core/qa_hf.py +9 -11
- core/summarizer_hf.py +9 -14
app_gradio.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
import gradio as gr
|
| 3 |
from core.dataset import load_jsonl_dataset
|
| 4 |
from core.summarizer_hf import summarize_text
|
|
@@ -6,73 +5,87 @@ from core.auto_qa_hf import generate_questions
|
|
| 6 |
from core.qa_hf import answer_question_with_score
|
| 7 |
|
| 8 |
DATASET_PATH = "datasets/my_dataset.jsonl"
|
| 9 |
-
|
| 10 |
|
|
|
|
| 11 |
doc_map = {f"{d['id']} - {d['title']}": d["text"] for d in docs}
|
| 12 |
doc_choices = ["-- none --"] + list(doc_map.keys())
|
| 13 |
|
| 14 |
def load_doc(choice):
|
| 15 |
if choice == "-- none --":
|
| 16 |
return ""
|
| 17 |
-
return doc_map
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def revision_mode(text, n_questions):
|
| 22 |
text = (text or "").strip()
|
| 23 |
if len(text) < 80:
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
-
for q in
|
| 33 |
res = answer_question_with_score(text, q)
|
| 34 |
ans, score = res["answer"], res["score"]
|
| 35 |
-
|
| 36 |
-
# ✅ filter out weak/empty answers
|
| 37 |
-
if score < 0.20 or len(ans) < 2:
|
| 38 |
continue
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
qa_text = ""
|
| 53 |
-
for i, (score, q, ans) in enumerate(
|
| 54 |
qa_text += f"Q{i}: {q}\nA{i}: {ans}\n(Confidence: {score:.2f})\n\n"
|
| 55 |
|
| 56 |
return summary, qa_text
|
| 57 |
|
| 58 |
|
| 59 |
with gr.Blocks() as demo:
|
| 60 |
-
gr.Markdown("# 🧠 AI
|
| 61 |
-
gr.Markdown("✅
|
| 62 |
|
| 63 |
with gr.Row():
|
| 64 |
-
|
| 65 |
-
|
|
|
|
| 66 |
|
| 67 |
-
text = gr.Textbox(label="📝 Text Input", lines=10
|
| 68 |
load_btn.click(load_doc, inputs=choice, outputs=text)
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
summary_out = gr.Textbox(label="✅ Ultra Short Summary", lines=4)
|
| 74 |
-
qa_out = gr.Textbox(label="✅ Auto Revision Questions & Answers", lines=14)
|
| 75 |
|
| 76 |
-
run_btn.click(revision_mode, inputs=[text,
|
| 77 |
|
| 78 |
demo.launch(share=True)
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from core.dataset import load_jsonl_dataset
|
| 3 |
from core.summarizer_hf import summarize_text
|
|
|
|
| 5 |
from core.qa_hf import answer_question_with_score
|
| 6 |
|
| 7 |
# App-wide constants and the dataset-backed document picker.
DATASET_PATH = "datasets/my_dataset.jsonl"
MIN_QUESTIONS = 3  # number of Q&A pairs shown to the user

docs = load_jsonl_dataset(DATASET_PATH)
# Dropdown label "<id> - <title>" -> full document text.
doc_map = {f"{doc['id']} - {doc['title']}": doc["text"] for doc in docs}
doc_choices = ["-- none --", *doc_map]
|
| 13 |
|
| 14 |
def load_doc(choice):
    """Return the text of the selected dataset document ("" for the sentinel)."""
    if choice != "-- none --":
        # Unknown keys (stale dropdown state) fall back to an empty textbox.
        return doc_map.get(choice, "")
    return ""
|
| 18 |
|
| 19 |
+
def revision_mode(text, lang):
    """Summarize *text* and build up to MIN_QUESTIONS revision Q&A pairs.

    Strategy: generate many candidate questions, answer each with the QA
    model, rank by answer confidence, and keep the best MIN_QUESTIONS.
    If too few survive, retry with questions generated from the summary
    (lower confidence bar), then with generic fallback questions.

    Args:
        text: raw input text (may be None/empty).
        lang: "en" or "fr" — forwarded to the summarizer / question
            generator and used for user-facing messages.

    Returns:
        (summary, qa_text) strings for the two output textboxes.
    """
    text = (text or "").strip()
    if len(text) < 80:
        msg = "Texte trop court." if lang == "fr" else "Text too short."
        return msg, msg

    summary = summarize_text(text, lang=lang)

    scored = []  # (score, question, answer), best-first after sorting
    seen = set()  # normalized text of ACCEPTED questions, to avoid duplicate Q&A

    def _collect(questions, min_score):
        # Score each candidate against the full text; keep confident,
        # non-empty answers. Skips questions already accepted so the same
        # Q&A pair cannot appear twice across passes. Mutates scored/seen.
        for q in questions:
            key = q.strip().lower()
            if key in seen:
                continue
            res = answer_question_with_score(text, q)
            ans, score = res["answer"], res["score"]
            if score < min_score or len(ans) < 2:
                continue
            seen.add(key)
            scored.append((score, q, ans))

    # Pass 1: questions generated from the full text.
    _collect(generate_questions(text, lang=lang, n_questions=60), 0.10)
    scored.sort(key=lambda x: x[0], reverse=True)
    top = scored[:MIN_QUESTIONS]

    # Pass 2: second chance — generate from the summary, lower threshold.
    if len(top) < MIN_QUESTIONS:
        _collect(generate_questions(summary, lang=lang, n_questions=60), 0.08)
        scored.sort(key=lambda x: x[0], reverse=True)
        top = scored[:MIN_QUESTIONS]

    # Final fallback: generic questions with a very permissive threshold.
    if len(top) < MIN_QUESTIONS:
        generic = (
            ["De quoi parle le texte ?", "Quel est le fait le plus important ?", "Quelle est la conclusion principale ?"]
            if lang == "fr"
            else ["What is the text about?", "What is the most important fact?", "What is the main conclusion?"]
        )
        for q in generic:
            if q.strip().lower() in seen:  # already answered above
                continue
            res = answer_question_with_score(text, q)
            ans, score = res["answer"], res["score"]
            if score > 0.03 and len(ans) > 1:
                top.append((score, q, ans))

    qa_text = ""
    for i, (score, q, ans) in enumerate(top[:MIN_QUESTIONS], start=1):
        qa_text += f"Q{i}: {q}\nA{i}: {ans}\n(Confidence: {score:.2f})\n\n"

    return summary, qa_text
|
| 71 |
|
| 72 |
|
| 73 |
# --- Gradio UI (component creation order defines the on-screen layout) ---
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 AI Summarizer + Auto Q&A (EN/FR)")
    gr.Markdown("✅ Generates short summary + **3 automatic revision questions** with answers.")

    with gr.Row():
        lang_radio = gr.Radio(["en", "fr"], value="en", label="Language / Langue")
        doc_dropdown = gr.Dropdown(doc_choices, label="📚 Dataset Document")
        btn_load = gr.Button("Load Doc")

    input_box = gr.Textbox(label="📝 Text Input", lines=10)
    btn_load.click(load_doc, inputs=doc_dropdown, outputs=input_box)

    btn_run = gr.Button("🚀 Generate (Summary + 3 Q&A)")
    summary_box = gr.Textbox(label="✅ Summary / Résumé", lines=4)
    qa_box = gr.Textbox(label="✅ Questions & Answers", lines=12)

    btn_run.click(revision_mode, inputs=[input_box, lang_radio], outputs=[summary_box, qa_box])

demo.launch(share=True)
|
core/__init__.py
ADDED
|
File without changes
|
core/auto_qa_hf.py
CHANGED
|
@@ -1,41 +1,50 @@
|
|
| 1 |
from transformers import pipeline
|
| 2 |
|
| 3 |
-
_qg = pipeline(
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
)
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
seen = set()
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
| 28 |
final.append(q)
|
| 29 |
-
seen.add(
|
| 30 |
if len(final) >= n_questions:
|
| 31 |
break
|
| 32 |
|
| 33 |
-
# fallback if model returns nothing
|
| 34 |
if not final:
|
| 35 |
-
final = [
|
| 36 |
-
|
| 37 |
-
"What are the key points?",
|
| 38 |
-
"Why is it important?"
|
| 39 |
-
][:n_questions]
|
| 40 |
|
| 41 |
return final
|
|
|
|
| 1 |
from transformers import pipeline
|
| 2 |
|
| 3 |
+
# Multilingual text2text model used for question generation.
# NOTE(review): google/mt5-base is a raw pretrained checkpoint (not
# instruction-tuned) — verify it actually follows the prompts built in
# generate_questions before relying on its output quality.
_qg = pipeline(
    "text2text-generation",
    model="google/mt5-base",
)
|
| 4 |
+
|
| 5 |
+
def generate_questions(text: str, lang: str = "en", n_questions: int = 50):
    """Generate up to *n_questions* short revision questions for *text*.

    Args:
        text: source passage; inputs under 80 chars get a single generic
            question instead of a model call.
        lang: "en" or "fr" — controls the prompt and fallback questions.
        n_questions: hard cap on the number of questions returned.

    Returns:
        Non-empty list of question strings (generic fallbacks when the
        model output yields nothing usable).
    """
    import re  # local import: only needed for list-marker cleanup below

    text = (text or "").strip()
    if len(text) < 80:
        return ["Quel est le sujet principal ?"] if lang == "fr" else ["What is the main topic?"]

    if lang == "fr":
        prompt = (
            "Génère 10 questions courtes de révision basées uniquement sur ce texte. "
            "Une question par ligne.\n\n"
            f"TEXTE:\n{text}"
        )
    else:
        prompt = (
            "Generate 10 short revision questions based only on this text. "
            "One question per line.\n\n"
            f"TEXT:\n{text}"
        )

    out = _qg(prompt, max_length=256, do_sample=False)[0]["generated_text"]

    questions = []
    for line in out.split("\n"):
        # Strip bullets AND "1." / "2)" style numbering the model often emits
        # (previously only "-• " was stripped, leaving numbered prefixes in).
        q = re.sub(r"^\s*(?:[-•*]+|\d+\s*[.)])\s*", "", line).strip()
        if len(q) < 6:  # too short to be a real question
            continue
        if not q.endswith("?"):
            q += "?"
        questions.append(q)

    # Dedupe (case-insensitive) and enforce the n_questions cap.
    seen = set()
    final = []
    for q in questions:
        qn = q.lower()
        if qn not in seen:
            final.append(q)
            seen.add(qn)
        if len(final) >= n_questions:
            break

    # Fallback if the model produced nothing usable; respect the cap here too.
    if not final:
        final = (
            ["Quel est le sujet principal ?", "Quels sont les points importants ?", "Quelle est la conclusion ?"]
            if lang == "fr"
            else ["What is the main topic?", "What are the key points?", "What is the conclusion?"]
        )[:n_questions]

    return final
|
core/qa_hf.py
CHANGED
|
@@ -2,22 +2,20 @@ from transformers import pipeline
|
|
| 2 |
|
| 3 |
_qa = pipeline(
|
| 4 |
"question-answering",
|
| 5 |
-
model="deepset/roberta-
|
| 6 |
-
device=0
|
| 7 |
)
|
| 8 |
|
| 9 |
def answer_question_with_score(context: str, question: str):
|
| 10 |
-
context = context.strip()
|
| 11 |
-
question = question.strip()
|
| 12 |
if not context or not question:
|
| 13 |
return {"answer": "", "score": 0.0}
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
score = float(
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
score = 0.0
|
| 22 |
|
| 23 |
-
return {"answer":
|
|
|
|
| 2 |
|
| 3 |
# Extractive QA model (multilingual, SQuAD2-style training data).
_qa = pipeline("question-answering", model="deepset/xlm-roberta-large-squad2")
|
| 7 |
|
| 8 |
def answer_question_with_score(context: str, question: str):
    """Answer *question* from *context* using the extractive QA model.

    Returns:
        {"answer": str, "score": float}. Blank inputs, or an empty answer
        from the model, yield {"answer": "", "score": 0.0}.
    """
    empty = {"answer": "", "score": 0.0}

    ctx = (context or "").strip()
    q = (question or "").strip()
    if not (ctx and q):
        return empty

    result = _qa(question=q, context=ctx)
    answer = (result.get("answer") or "").strip()
    if not answer:
        return empty
    return {"answer": answer, "score": float(result.get("score", 0.0))}
|
core/summarizer_hf.py
CHANGED
|
@@ -1,21 +1,16 @@
|
|
| 1 |
from transformers import pipeline
|
| 2 |
|
|
|
|
| 3 |
_summarizer = pipeline(
|
| 4 |
-
"
|
| 5 |
-
model="
|
| 6 |
-
device=0
|
| 7 |
)
|
| 8 |
|
| 9 |
-
def summarize_text(text: str) -> str:
|
| 10 |
-
text = text.strip()
|
| 11 |
if len(text) < 80:
|
| 12 |
-
return "
|
| 13 |
|
| 14 |
-
|
| 15 |
-
out = _summarizer(
|
| 16 |
-
|
| 17 |
-
max_length=60, # كان 160
|
| 18 |
-
min_length=20, # كان 40
|
| 19 |
-
do_sample=False
|
| 20 |
-
)
|
| 21 |
-
return out[0]["summary_text"]
|
|
|
|
| 1 |
from transformers import pipeline
|
| 2 |
|
| 3 |
+
# Multilingual abstractive summarizer (mT5 fine-tuned on XLSum).
_summarizer = pipeline("text2text-generation", model="csebuetnlp/mT5_multilingual_XLSum")
|
| 8 |
|
| 9 |
+
def summarize_text(text: str, lang: str = "en") -> str:
    """Return a short abstractive summary of *text*.

    Args:
        text: input passage; under 80 chars returns a "too short" message
            instead of calling the model.
        lang: "en" or "fr" — only affects the too-short message; the
            summarization model itself is multilingual.

    Returns:
        Summary string (stripped), or a localized "too short" message.
    """
    text = (text or "").strip()
    if len(text) < 80:
        return "Texte trop court pour résumer." if lang == "fr" else "Text too short to summarize."

    # The XLSum-finetuned mT5 checkpoint takes the raw article text directly;
    # the "summarize: " task prefix is a convention of the original T5
    # checkpoints and would just be summarized as part of the input here.
    # truncation=True keeps over-length inputs within the model's max input
    # size instead of raising.
    out = _summarizer(text, max_length=80, do_sample=False, truncation=True)
    return out[0]["generated_text"].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|