Spaces:
Sleeping
Sleeping
File size: 8,103 Bytes
import json
import os
import re
from typing import Dict, List, Tuple
from config import Config
class RAGService:
    """Keyword retrieval over a JSON file of historical sign reference notes.

    The corpus is a flat list of dicts with the keys ``category``, ``term``,
    ``definition`` and ``citation``. It is filled by :meth:`load_corpus` and
    searched with a small additive keyword score.
    """

    # (JSON section key, category label, citation) for every indexed section,
    # in the order the sections are appended to the corpus.
    _SECTIONS = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    # Separators that break a record term such as "woman_seated" into words.
    _TERM_SEPARATORS = re.compile(r'[_ \-]')

    # Runs of ASCII alphanumerics, Greek / Greek Extended letters, or CJK
    # unified ideographs; everything else acts as a word boundary.
    _WORD_PATTERN = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')

    def __init__(self, references_path: str = None):
        """Create the service and index the reference file immediately.

        Args:
            references_path: Path to the JSON reference file. When falsy, the
                path is taken from ``Config().REFERENCES_PATH``.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        Reads the JSON file at ``self.references_path`` and stores one record
        per ``term -> note`` pair of each known section in ``self.corpus``.
        Loading is best-effort: a missing file or any error while reading or
        indexing is reported with ``print`` and never raised.
        """
        try:
            if not os.path.exists(self.references_path):
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
                return
            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Reset only after the file parsed, so that calling load_corpus()
            # again replaces the index instead of appending duplicate records,
            # while an unreadable file leaves the previous index untouched.
            self.corpus = []
            for section_key, category, citation in self._SECTIONS:
                for term, note in data.get(section_key, {}).items():
                    self.corpus.append({
                        "category": category,
                        "term": term,
                        "definition": note,
                        "citation": citation,
                    })
            print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
        except Exception as e:
            # Deliberately broad: the service must stay constructible even when
            # the reference file is unreadable or has an unexpected layout.
            print(f"[ERROR] Failed to initialize RAG index: {e}")

    @staticmethod
    def _score(needle: str, name: str, name_parts: List[str], definition: str) -> int:
        """Return the relevance score of one lower-cased query term for one record."""
        if needle == name:
            return 10
        # Whole-word hit inside a compound term (e.g. "woman" in "woman_seated").
        if needle in name_parts:
            return 5
        # Substring matching is limited to terms longer than three characters
        # so that very short tokens cannot match almost everything.
        if len(needle) > 3:
            if needle in name:
                return 5
            if needle in definition:
                return 2
        return 0

    def _rank_records(self, query_terms: List[str], max_results: int) -> List[Dict]:
        """Return up to ``max_results`` corpus records, best score first.

        Each record is ranked by the highest score any query term gives it, so
        the result does not depend on the order of ``query_terms``. Records
        with equal scores keep the order in which they were first matched.
        """
        if not query_terms or not self.corpus or max_results <= 0:
            return []

        # Normalise every record once instead of once per query term.
        prepared = []
        for record in self.corpus:
            name = str(record["term"]).lower()
            prepared.append((
                f"{record['category']}:{record['term']}",
                record,
                name,
                self._TERM_SEPARATORS.split(name),
                # str() guards against non-string notes in the JSON file.
                str(record["definition"]).lower(),
            ))

        best = {}  # record key -> (best score so far, record)
        for term in query_terms:
            if not isinstance(term, str):
                continue
            needle = term.lower().strip()
            if not needle:
                continue
            for key, record, name, name_parts, definition in prepared:
                score = self._score(needle, name, name_parts, definition)
                if score > 0 and (key not in best or score > best[key][0]):
                    # Re-assigning an existing key keeps its original position,
                    # which preserves first-match order among equal scores.
                    best[key] = (score, record)

        ranked = sorted(best.values(), key=lambda pair: pair[0], reverse=True)
        return [record for _, record in ranked[:max_results]]

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Args:
            query_terms: Candidate keywords; blank and non-string entries are ignored.
            max_results: Maximum number of records to include.

        Returns:
            A Markdown-style block with a header line followed by one numbered
            entry per matching record, or ``""`` when nothing matches.
        """
        top_matches = self._rank_records(query_terms, max_results)
        if not top_matches:
            return ""
        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f" *Source Citation:* {match['citation']}"
            )
        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations.

        Args:
            query_terms: Candidate keywords; blank and non-string entries are ignored.
            max_results: Maximum number of records to return.

        Returns:
            The matching corpus records (the same dict objects stored in
            ``self.corpus``), best score first; empty when nothing matches.
        """
        return self._rank_records(query_terms, max_results)

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Args:
            base_system_prompt: Prompt returned unchanged when no reference matches.
            extracted_text: Free text; its first 15 word tokens become query terms.
            extracted_symbols: Optional symbol labels, queried ahead of the text tokens.

        Returns:
            ``base_system_prompt`` followed by the grounding block and citation
            instructions, or ``base_system_prompt`` alone when nothing matches.
        """
        query_terms = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)
        if extracted_text:
            words = self._WORD_PATTERN.findall(extracted_text)
            query_terms.extend(words[:15])  # Cap to prevent excessive token use

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
        if not grounding_context:
            return base_system_prompt

        enriched_prompt = (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )
        return enriched_prompt
|