mdvallahmedomar commited on
Commit
cfcbf08
·
1 Parent(s): 55ace82

that is enough for now

Browse files
Files changed (5) hide show
  1. app_gradio.py +52 -39
  2. core/__init__.py +0 -0
  3. core/auto_qa_hf.py +40 -31
  4. core/qa_hf.py +9 -11
  5. core/summarizer_hf.py +9 -14
app_gradio.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import gradio as gr
3
  from core.dataset import load_jsonl_dataset
4
  from core.summarizer_hf import summarize_text
@@ -6,73 +5,87 @@ from core.auto_qa_hf import generate_questions
6
  from core.qa_hf import answer_question_with_score
7
 
8
  DATASET_PATH = "datasets/my_dataset.jsonl"
9
- docs = load_jsonl_dataset(DATASET_PATH)
10
 
 
11
  doc_map = {f"{d['id']} - {d['title']}": d["text"] for d in docs}
12
  doc_choices = ["-- none --"] + list(doc_map.keys())
13
 
14
  def load_doc(choice):
15
  if choice == "-- none --":
16
  return ""
17
- return doc_map[choice]
18
 
19
-
20
-
21
- def revision_mode(text, n_questions):
22
  text = (text or "").strip()
23
  if len(text) < 80:
24
- return "Text too short.", "Please provide a longer text."
 
25
 
26
- target = int(n_questions)
27
 
28
- # Generate MANY questions to increase chance of good ones
29
- questions = generate_questions(text, n_questions=target * 6)
30
 
31
- scored_pairs = []
32
- for q in questions:
33
  res = answer_question_with_score(text, q)
34
  ans, score = res["answer"], res["score"]
35
-
36
- # ✅ filter out weak/empty answers
37
- if score < 0.20 or len(ans) < 2:
38
  continue
39
-
40
- scored_pairs.append((score, q, ans))
41
-
42
- # ✅ Sort by confidence score and keep top N
43
- scored_pairs.sort(key=lambda x: x[0], reverse=True)
44
- top_pairs = scored_pairs[:target]
45
-
46
- summary = summarize_text(text)
47
-
48
- # If still empty, show message (NO hardcoded topic questions)
49
- if not top_pairs:
50
- return summary, "Could not generate strong Q&A pairs. Try a longer text or increase text clarity."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  qa_text = ""
53
- for i, (score, q, ans) in enumerate(top_pairs, start=1):
54
  qa_text += f"Q{i}: {q}\nA{i}: {ans}\n(Confidence: {score:.2f})\n\n"
55
 
56
  return summary, qa_text
57
 
58
 
59
  with gr.Blocks() as demo:
60
- gr.Markdown("# 🧠 AI Revision App (Summary + Auto Q&A)")
61
- gr.Markdown("✅ Generate a short summary + revision questions with answers extracted from the text.")
62
 
63
  with gr.Row():
64
- choice = gr.Dropdown(doc_choices, label="📚 Choose Dataset Document")
65
- load_btn = gr.Button("Load Document")
 
66
 
67
- text = gr.Textbox(label="📝 Text Input", lines=10, placeholder="Paste your text or load from dataset...")
68
  load_btn.click(load_doc, inputs=choice, outputs=text)
69
 
70
- n_questions = gr.Slider(3, 10, value=5, step=1, label="Number of Revision Questions")
71
- run_btn = gr.Button("🚀 Generate Revision (Summary + Auto Q&A)")
72
-
73
- summary_out = gr.Textbox(label="✅ Ultra Short Summary", lines=4)
74
- qa_out = gr.Textbox(label="✅ Auto Revision Questions & Answers", lines=14)
75
 
76
- run_btn.click(revision_mode, inputs=[text, n_questions], outputs=[summary_out, qa_out])
77
 
78
  demo.launch(share=True)
 
 
1
  import gradio as gr
2
  from core.dataset import load_jsonl_dataset
3
  from core.summarizer_hf import summarize_text
 
5
  from core.qa_hf import answer_question_with_score
6
 
7
# Path to the JSONL dataset whose documents populate the dropdown.
DATASET_PATH = "datasets/my_dataset.jsonl"

# Target number of Q&A pairs shown to the user.
MIN_QUESTIONS = 3

docs = load_jsonl_dataset(DATASET_PATH)

# Human-readable "<id> - <title>" label -> document body.
doc_map = {f"{d['id']} - {d['title']}": d["text"] for d in docs}
doc_choices = ["-- none --"] + list(doc_map.keys())
13
 
14
def load_doc(choice):
    """Return the body of the selected dataset document ("" for the placeholder or an unknown key)."""
    return "" if choice == "-- none --" else doc_map.get(choice, "")
18
 
19
def revision_mode(text, lang):
    """Summarize *text* and build up to MIN_QUESTIONS Q&A revision pairs.

    Parameters
    ----------
    text : str
        Input passage; must be at least 80 characters after stripping.
    lang : str
        "en" or "fr"; selects prompt/message language.

    Returns
    -------
    tuple[str, str]
        (summary, formatted Q&A text). On too-short input both elements
        carry the same error message.
    """
    text = (text or "").strip()
    if len(text) < 80:
        msg = "Texte trop court." if lang == "fr" else "Text too short."
        return msg, msg

    summary = summarize_text(text, lang=lang)

    # Questions already scored (normalized), shared across passes so the
    # second-chance pass and the generic fallback never produce duplicates.
    seen = set()
    scored = []

    def _score_candidates(candidates, min_score):
        # Answer each candidate against the full text; keep confident,
        # non-trivial answers. Mutates `scored` / `seen`.
        for q in candidates:
            key = q.strip().lower()
            if key in seen:  # FIX: skip questions already scored in an earlier pass
                continue
            seen.add(key)
            res = answer_question_with_score(text, q)
            ans, score = res["answer"], res["score"]
            if score < min_score or len(ans) < 2:
                continue
            scored.append((score, q, ans))

    # Pass 1: generate many candidates from the text, rank by QA confidence.
    _score_candidates(generate_questions(text, lang=lang, n_questions=60), 0.10)
    scored.sort(key=lambda x: x[0], reverse=True)
    top = scored[:MIN_QUESTIONS]

    # Pass 2 (second chance): derive extra candidates from the summary,
    # with a slightly looser confidence threshold.
    if len(top) < MIN_QUESTIONS:
        _score_candidates(generate_questions(summary, lang=lang, n_questions=60), 0.08)
        scored.sort(key=lambda x: x[0], reverse=True)
        top = scored[:MIN_QUESTIONS]

    # Final fallback: GENERAL questions only (no hardcoded topic content).
    if len(top) < MIN_QUESTIONS:
        generic = (
            ["De quoi parle le texte ?", "Quel est le fait le plus important ?", "Quelle est la conclusion principale ?"]
            if lang == "fr"
            else ["What is the text about?", "What is the most important fact?", "What is the main conclusion?"]
        )
        for q in generic:
            if q.strip().lower() in seen:  # FIX: don't re-add a question already in `top`
                continue
            res = answer_question_with_score(text, q)
            ans, score = res["answer"], res["score"]
            if score > 0.03 and len(ans) > 1:
                top.append((score, q, ans))

    qa_text = ""
    for i, (score, q, ans) in enumerate(top[:MIN_QUESTIONS], start=1):
        qa_text += f"Q{i}: {q}\nA{i}: {ans}\n(Confidence: {score:.2f})\n\n"

    return summary, qa_text
71
 
72
 
73
# Gradio UI wiring: language selector, dataset loader, and the
# summary + Q&A generation button.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 AI Summarizer + Auto Q&A (EN/FR)")
    gr.Markdown("✅ Generates short summary + **3 automatic revision questions** with answers.")

    with gr.Row():
        lang = gr.Radio(["en", "fr"], value="en", label="Language / Langue")
        choice = gr.Dropdown(doc_choices, label="📚 Dataset Document")
        load_btn = gr.Button("Load Doc")

    text = gr.Textbox(label="📝 Text Input", lines=10)
    load_btn.click(load_doc, inputs=choice, outputs=text)

    run_btn = gr.Button("🚀 Generate (Summary + 3 Q&A)")
    # FIX: label had a stray leading space and lacked the ✅ prefix used by
    # every other output label.
    summary_out = gr.Textbox(label="✅ Summary / Résumé", lines=4)
    qa_out = gr.Textbox(label="✅ Questions & Answers", lines=12)

    run_btn.click(revision_mode, inputs=[text, lang], outputs=[summary_out, qa_out])

# Launched at import time (Spaces-style entry point); share=True exposes a
# public tunnel URL.
demo.launch(share=True)
core/__init__.py ADDED
File without changes
core/auto_qa_hf.py CHANGED
@@ -1,41 +1,50 @@
1
  from transformers import pipeline
2
 
3
- _qg = pipeline(
4
- "text2text-generation",
5
- model="iarfmoose/t5-base-question-generator",
6
- device=0
7
- )
8
-
9
- def generate_questions(text: str, n_questions: int = 5):
10
- text = text.strip()
11
- if len(text) < 40:
12
- return ["What is the main idea?"]
13
-
14
- prompt = f"generate questions: {text}"
15
- out = _qg(prompt, max_length=256, do_sample=False)
16
- gen = out[0]["generated_text"]
17
-
18
- # split by ?
19
- raw = [q.strip() for q in gen.split("?") if q.strip()]
20
- qs = [q + "?" for q in raw]
21
-
22
- # clean duplicates
23
- final = []
 
 
 
 
 
 
 
 
 
 
 
 
24
  seen = set()
25
- for q in qs:
26
- q_norm = q.lower()
27
- if q_norm not in seen and len(q) > 5:
 
28
  final.append(q)
29
- seen.add(q_norm)
30
  if len(final) >= n_questions:
31
  break
32
 
33
- # fallback if model returns nothing
34
  if not final:
35
- final = [
36
- "What is the main topic?",
37
- "What are the key points?",
38
- "Why is it important?"
39
- ][:n_questions]
40
 
41
  return final
 
1
  from transformers import pipeline
2
 
3
# Multilingual text2text model used for question generation.
_qg = pipeline("text2text-generation", model="google/mt5-base")

def generate_questions(text: str, lang: str = "en", n_questions: int = 50):
    """Generate up to *n_questions* short revision questions for *text*.

    Builds an EN or FR instruction prompt, runs the model once, then splits
    the generation into lines, normalizes each line into a "?"-terminated
    question, and de-duplicates case-insensitively.  A fixed generic trio is
    returned when the model yields nothing usable, and a single generic
    question when the input is too short to ground real questions.
    """
    text = (text or "").strip()
    if len(text) < 80:
        # Too little material to generate grounded questions.
        return ["Quel est le sujet principal ?"] if lang == "fr" else ["What is the main topic?"]

    if lang == "fr":
        prompt = (
            "Génère 10 questions courtes de révision basées uniquement sur ce texte. "
            "Une question par ligne.\n\n"
            f"TEXTE:\n{text}"
        )
    else:
        prompt = (
            "Generate 10 short revision questions based only on this text. "
            "One question per line.\n\n"
            f"TEXT:\n{text}"
        )

    generated = _qg(prompt, max_length=256, do_sample=False)[0]["generated_text"]

    final = []
    seen = set()
    for raw_line in generated.split("\n"):
        # Drop list bullets/dashes and surrounding whitespace.
        candidate = raw_line.strip().lstrip("-• ").strip()
        if len(candidate) < 6:
            continue
        if not candidate.endswith("?"):
            candidate += "?"
        key = candidate.lower()
        if key in seen:
            continue
        seen.add(key)
        final.append(candidate)
        if len(final) >= n_questions:
            break

    if not final:
        # Generic fallback so callers always receive at least something.
        if lang == "fr":
            final = ["Quel est le sujet principal ?", "Quels sont les points importants ?", "Quelle est la conclusion ?"]
        else:
            final = ["What is the main topic?", "What are the key points?", "What is the conclusion?"]

    return final
core/qa_hf.py CHANGED
@@ -2,22 +2,20 @@ from transformers import pipeline
2
 
3
  _qa = pipeline(
4
  "question-answering",
5
- model="deepset/roberta-base-squad2",
6
- device=0
7
  )
8
 
9
  def answer_question_with_score(context: str, question: str):
10
- context = context.strip()
11
- question = question.strip()
12
  if not context or not question:
13
  return {"answer": "", "score": 0.0}
14
 
15
- result = _qa(question=question, context=context)
16
- answer = (result.get("answer") or "").strip()
17
- score = float(result.get("score", 0.0))
18
 
19
- # if model returns empty / nonsense
20
- if not answer:
21
- score = 0.0
22
 
23
- return {"answer": answer, "score": score}
 
2
 
3
# Multilingual extractive QA model, shared by the EN and FR flows.
_qa = pipeline(
    "question-answering",
    model="deepset/xlm-roberta-large-squad2"
)

def answer_question_with_score(context: str, question: str):
    """Answer *question* from *context*.

    Returns a dict {"answer": str, "score": float}.  Blank inputs or an
    empty model answer yield {"answer": "", "score": 0.0} so callers can
    filter weak results uniformly.
    """
    context = (context or "").strip()
    question = (question or "").strip()
    if not (context and question):
        return {"answer": "", "score": 0.0}

    result = _qa(question=question, context=context)
    answer = (result.get("answer") or "").strip()
    if not answer:
        # Treat an empty extraction as a zero-confidence miss.
        return {"answer": "", "score": 0.0}

    return {"answer": answer, "score": float(result.get("score", 0.0))}
core/summarizer_hf.py CHANGED
@@ -1,21 +1,16 @@
1
  from transformers import pipeline
2
 
 
3
  _summarizer = pipeline(
4
- "summarization",
5
- model="facebook/bart-large-cnn",
6
- device=0
7
  )
8
 
9
- def summarize_text(text: str) -> str:
10
- text = text.strip()
11
  if len(text) < 80:
12
- return "النص قصير جدًا للتلخيص."
13
 
14
- # Ultra short summary
15
- out = _summarizer(
16
- text,
17
- max_length=60, # كان 160
18
- min_length=20, # كان 40
19
- do_sample=False
20
- )
21
- return out[0]["summary_text"]
 
1
  from transformers import pipeline
2
 
3
# Multilingual abstractive summarizer (mT5 fine-tuned on XLSum).
_summarizer = pipeline(
    "text2text-generation",
    model="csebuetnlp/mT5_multilingual_XLSum"
)

def summarize_text(text: str, lang: str = "en") -> str:
    """Return a short (max_length=80 tokens) summary of *text*.

    *lang* only selects the language of the too-short message; the
    underlying model is multilingual.
    """
    cleaned = (text or "").strip()
    if len(cleaned) < 80:
        if lang == "fr":
            return "Texte trop court pour résumer."
        return "Text too short to summarize."

    # NOTE(review): "summarize: " is a T5-style task prefix; the XLSum
    # checkpoint was fine-tuned on raw articles — confirm the prefix helps.
    result = _summarizer(f"summarize: {cleaned}", max_length=80, do_sample=False)
    return result[0]["generated_text"].strip()