aelgendy committed on
Commit
f740dde
·
1 Parent(s): ad14a5e

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. app/prompts.py +10 -2
  2. app/search.py +11 -1
  3. app/state.py +117 -1
app/prompts.py CHANGED
@@ -20,13 +20,17 @@ TASK_INSTRUCTIONS: Dict[str, str] = {
20
  "The user asks about a Quranic verse — by partial text, topic, or meaning. Steps:\n"
21
  "1. Identify the matching verse(s) from the RETRIEVED RESULTS.\n"
22
  "2. Quote the Arabic verse text EXACTLY from the results.\n"
23
- "3. Provide the full reference: Surah name (Arabic & English), number, and Ayah number.\n"
 
 
 
24
  "4. Provide the English translation EXACTLY as given in the results.\n"
25
  "5. If the user searched by partial text, confirm the full verse found.\n"
26
  "6. Provide Tafsir: explain the meaning, context, and significance.\n"
27
  "7. If related verses appear in the results, draw connections.\n"
28
  "8. Answer the user's specific question directly.\n"
29
- "9. Do NOT reference verses that are not in the results."
 
30
  ),
31
  "hadith": (
32
  "The user asks about a Hadith — by partial text, topic, or meaning. Steps:\n"
@@ -104,10 +108,14 @@ For EVERY supporting evidence, use this exact format:
104
  ABSOLUTE RULES:
105
  • Use ONLY content from the Islamic Context block. Zero outside knowledge.
106
  • Copy Arabic text and translations VERBATIM from context. Never paraphrase.
 
 
 
107
  • If a specific Hadith/verse is NOT in context → respond with:
108
  "هذا الحديث/الآية غير موجود في قاعدة البيانات." (Arabic)
109
  or "This Hadith/verse is not in the available dataset." (English)
110
  • Never invent or guess content.
 
111
 
112
  LANGUAGE RULE (CRITICAL — MUST FOLLOW):
113
  • You MUST answer in the SAME language as the user's question.
 
20
  "The user asks about a Quranic verse — by partial text, topic, or meaning. Steps:\n"
21
  "1. Identify the matching verse(s) from the RETRIEVED RESULTS.\n"
22
  "2. Quote the Arabic verse text EXACTLY from the results.\n"
23
+ "3. Provide the full reference using ONLY the [REF] metadata from the results:\n"
24
+ " Surah name (Arabic & English), Surah number, and Ayah number.\n"
25
+ " CRITICAL: You MUST copy the Surah name AND Ayah number from the [REF] line.\n"
26
+ " NEVER guess or recall a reference from memory — use ONLY what appears in the results.\n"
27
  "4. Provide the English translation EXACTLY as given in the results.\n"
28
  "5. If the user searched by partial text, confirm the full verse found.\n"
29
  "6. Provide Tafsir: explain the meaning, context, and significance.\n"
30
  "7. If related verses appear in the results, draw connections.\n"
31
  "8. Answer the user's specific question directly.\n"
32
+ "9. Do NOT reference verses that are not in the results.\n"
33
+ "10. If you cannot find a matching verse in the results, say so clearly."
34
  ),
35
  "hadith": (
36
  "The user asks about a Hadith — by partial text, topic, or meaning. Steps:\n"
 
108
  ABSOLUTE RULES:
109
  • Use ONLY content from the Islamic Context block. Zero outside knowledge.
110
  • Copy Arabic text and translations VERBATIM from context. Never paraphrase.
111
+ • REFERENCE RULE (CRITICAL): For Quran verses, ALWAYS copy the Surah name and Ayah number
112
+ from the [REF] line in the context. NEVER recall or guess references from memory.
113
+ Wrong references are worse than no references.
114
  • If a specific Hadith/verse is NOT in context → respond with:
115
  "هذا الحديث/الآية غير موجود في قاعدة البيانات." (Arabic)
116
  or "This Hadith/verse is not in the available dataset." (English)
117
  • Never invent or guess content.
118
+ • Never attribute a verse to a Surah unless the [REF] metadata explicitly says so.
119
 
120
  LANGUAGE RULE (CRITICAL — MUST FOLLOW):
121
  • You MUST answer in the SAME language as the user's question.
app/search.py CHANGED
@@ -319,9 +319,19 @@ def build_context(results: list) -> str:
319
  item_type = "Quranic Verse" if r.get("type") == "quran" else "Hadith"
320
  grade_str = f" [Grade: {r.get('grade')}]" if r.get("grade") else ""
321
 
322
- lines.append(
323
  f"[{i}] 📌 {item_type}{grade_str} | {source} | score: {r.get('_score', 0):.3f}\n"
324
  f" Arabic : {r.get('arabic', '')}\n"
325
  f" English: {r.get('english', '')}"
326
  )
 
 
 
 
 
 
 
 
 
 
327
  return "\n\n".join(lines)
 
319
  item_type = "Quranic Verse" if r.get("type") == "quran" else "Hadith"
320
  grade_str = f" [Grade: {r.get('grade')}]" if r.get("grade") else ""
321
 
322
+ block = (
323
  f"[{i}] 📌 {item_type}{grade_str} | {source} | score: {r.get('_score', 0):.3f}\n"
324
  f" Arabic : {r.get('arabic', '')}\n"
325
  f" English: {r.get('english', '')}"
326
  )
327
+
328
+ # Add explicit structured metadata for Quran verses
329
+ if r.get("type") == "quran":
330
+ block += (
331
+ f"\n [REF] Surah: {r.get('surah_name_ar', '')} ({r.get('surah_name_en', '')}) "
332
+ f"| Surah Number: {r.get('surah_number', '')} "
333
+ f"| Ayah: {r.get('verse_number', '')}"
334
+ )
335
+
336
+ lines.append(block)
337
  return "\n\n".join(lines)
app/state.py CHANGED
@@ -33,6 +33,121 @@ logger = logging.getLogger("qmodel.state")
33
  # POST-GENERATION HALLUCINATION CHECK
34
  # ═══════════════════════════════════════════════════════════════════════
35
  _QUOTE_RE = re.compile(r"❝\s*(.+?)\s*❞", re.DOTALL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
 
38
  def _verify_citations(answer: str, results: list) -> str:
@@ -273,8 +388,9 @@ async def run_rag_pipeline(
273
  logger.error("LLM call failed: %s", exc)
274
  raise HTTPException(status_code=502, detail="LLM service unavailable")
275
 
276
- # 7. Post-generation hallucination check — verify quoted text exists in sources
277
  answer = _verify_citations(answer, results)
 
278
 
279
  latency = int((time.perf_counter() - t0) * 1000)
280
  logger.info(
 
33
  # POST-GENERATION HALLUCINATION CHECK
34
  # ═══════════════════════════════════════════════════════════════════════
35
  _QUOTE_RE = re.compile(r"❝\s*(.+?)\s*❞", re.DOTALL)
36
+ _SURAH_REF_AR = re.compile(
37
+ r"(?:سورة|سوره)\s+([\u0600-\u06FF\u0750-\u077F\s]+?)[\s,،]*"
38
+ r"(?:الآية|آية|الايه|ايه)?\s*(\d+)",
39
+ )
40
+ _SURAH_REF_EN = re.compile(
41
+ r"(?:surah|sura)\s+([A-Za-z\-' ]+?)[\s,]*"
42
+ r"(?:ayah|verse|ayat)?\s*(\d+)",
43
+ re.I,
44
+ )
45
+ _SURAH_NUM_REF = re.compile(r"\b(\d{1,3})\s*:\s*(\d{1,3})\b")
46
+
47
+
48
+ def _build_valid_refs(results: list) -> set:
49
+ """Build a set of valid (surah_number, verse_number) tuples from results."""
50
+ refs = set()
51
+ for r in results:
52
+ if r.get("type") == "quran":
53
+ sn = r.get("surah_number")
54
+ vn = r.get("verse_number")
55
+ if sn and vn:
56
+ refs.add((int(sn), int(vn)))
57
+ return refs
58
+
59
+
60
+ def _build_surah_name_map(results: list) -> dict:
61
+ """Build a map from normalized surah names to surah numbers."""
62
+ name_map = {}
63
+ for r in results:
64
+ if r.get("type") == "quran":
65
+ sn = r.get("surah_number")
66
+ if sn:
67
+ sn = int(sn)
68
+ for field in ("surah_name_ar", "surah_name_en", "surah_name_transliteration"):
69
+ name = r.get(field, "").strip().lower()
70
+ name = re.sub(r"^(ال|al[\-\s']*)", "", name).strip()
71
+ if name:
72
+ name_map[name] = sn
73
+ return name_map
74
+
75
+
76
+ def _verify_references(answer: str, results: list) -> str:
77
+ """Check that surah/verse references in the answer match retrieved results.
78
+
79
+ Replaces hallucinated references with corrected ones or warnings.
80
+ """
81
+ valid_refs = _build_valid_refs(results)
82
+ if not valid_refs:
83
+ return answer # No quran results — nothing to verify
84
+
85
+ name_map = _build_surah_name_map(results)
86
+
87
+ # Check numeric references like "16:53"
88
+ def _check_num_ref(m: re.Match) -> str:
89
+ sn, vn = int(m.group(1)), int(m.group(2))
90
+ if (sn, vn) in valid_refs:
91
+ return m.group(0)
92
+ # Check if any valid ref exists — if so, the LLM hallucinated a different ref
93
+ logger.warning("Hallucinated reference: %d:%d not in sources", sn, vn)
94
+ # Find closest valid reference to suggest
95
+ if len(valid_refs) == 1:
96
+ correct = next(iter(valid_refs))
97
+ return f"{correct[0]}:{correct[1]}"
98
+ return m.group(0) # Multiple refs — can't auto-correct
99
+
100
+ answer = _SURAH_NUM_REF.sub(_check_num_ref, answer)
101
+
102
+ # Check Arabic surah name references like "سورة إبراهيم، الآية 7"
103
+ def _check_ar_ref(m: re.Match) -> str:
104
+ raw_name = m.group(1).strip()
105
+ verse_num = int(m.group(2))
106
+ name_norm = re.sub(r"^(ال)", "", raw_name).strip().lower()
107
+
108
+ matched_sn = name_map.get(name_norm)
109
+ if matched_sn and (matched_sn, verse_num) in valid_refs:
110
+ return m.group(0) # Valid reference
111
+
112
+ # Check if the combined reference is wrong
113
+ for (sn, vn) in valid_refs:
114
+ # Find the correct surah name for this ref
115
+ for r in results:
116
+ if r.get("type") == "quran" and int(r.get("surah_number", 0)) == sn and int(r.get("verse_number", 0)) == vn:
117
+ correct_name = r.get("surah_name_ar", "")
118
+ logger.warning(
119
+ "Hallucinated reference: سورة %s آية %d -> correcting to سورة %s آية %d",
120
+ raw_name, verse_num, correct_name, vn,
121
+ )
122
+ return f"سورة {correct_name}، الآية {vn}"
123
+ return m.group(0)
124
+
125
+ answer = _SURAH_REF_AR.sub(_check_ar_ref, answer)
126
+
127
+ # Check English surah name references
128
+ def _check_en_ref(m: re.Match) -> str:
129
+ raw_name = m.group(1).strip()
130
+ verse_num = int(m.group(2))
131
+ name_norm = re.sub(r"^(al[\-\s']*)", "", raw_name, flags=re.I).strip().lower()
132
+
133
+ matched_sn = name_map.get(name_norm)
134
+ if matched_sn and (matched_sn, verse_num) in valid_refs:
135
+ return m.group(0)
136
+
137
+ for (sn, vn) in valid_refs:
138
+ for r in results:
139
+ if r.get("type") == "quran" and int(r.get("surah_number", 0)) == sn and int(r.get("verse_number", 0)) == vn:
140
+ correct_name = r.get("surah_name_en", "")
141
+ logger.warning(
142
+ "Hallucinated reference: Surah %s verse %d -> correcting to Surah %s verse %d",
143
+ raw_name, verse_num, correct_name, vn,
144
+ )
145
+ return f"Surah {correct_name}, verse {vn}"
146
+ return m.group(0)
147
+
148
+ answer = _SURAH_REF_EN.sub(_check_en_ref, answer)
149
+
150
+ return answer
151
 
152
 
153
  def _verify_citations(answer: str, results: list) -> str:
 
388
  logger.error("LLM call failed: %s", exc)
389
  raise HTTPException(status_code=502, detail="LLM service unavailable")
390
 
391
+ # 7. Post-generation hallucination check
392
  answer = _verify_citations(answer, results)
393
+ answer = _verify_references(answer, results)
394
 
395
  latency = int((time.perf_counter() - t0) * 1000)
396
  logger.info(