import json
import os
import re
from typing import Dict, List, Optional, Tuple

from config import Config

# Separators that split a compound record term (e.g. "woman_seated") into words.
_TERM_SEPARATOR_RE = re.compile(r'[_ \-]')

# Query words pulled from free text: runs of ASCII alphanumerics, Greek and
# Coptic (U+0370-03FF), Greek Extended (U+1F00-1FFF) or CJK ideographs
# (U+4E00-9FFF).
_QUERY_WORD_RE = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')

# (JSON section key, category label, citation) for every indexed section,
# in indexing order.
_REFERENCE_SECTIONS: Tuple[Tuple[str, str, str], ...] = (
    (
        "egypt_symbol_notes",
        "Egyptian Hieroglyphic Sign",
        "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
    ),
    (
        "greek_symbol_notes",
        "Greek Paleography Mark",
        "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
    ),
    (
        "latin_symbol_notes",
        "Latin Scribal Abbreviation",
        "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
    ),
    (
        "cuneiform_symbol_notes",
        "Mesopotamian Cuneiform Logogram",
        "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
    ),
)


class RAGService:
    """Keyword retrieval over a JSON corpus of historical sign references.

    Each corpus record is a dict with the keys ``category``, ``term``,
    ``definition`` and ``citation``.
    """

    def __init__(self, references_path: Optional[str] = None):
        """Create the service and index the reference file immediately.

        Args:
            references_path: Path of the JSON reference file. When falsy,
                ``Config().REFERENCES_PATH`` is used instead.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus: List[Dict] = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        Errors are reported on stdout and never raised. If the reference file
        does not exist the current corpus is left untouched. Otherwise the
        corpus is replaced (not appended to), so calling this method again
        does not duplicate records. When indexing fails part-way, the records
        indexed before the failure are kept.
        """
        records: List[Dict] = []
        try:
            if not os.path.exists(self.references_path):
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
                return

            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for section_key, category, citation in _REFERENCE_SECTIONS:
                for term, note in data.get(section_key, {}).items():
                    records.append({
                        "category": category,
                        "term": term,
                        "definition": note,
                        "citation": citation,
                    })
            print(f"[INFO] RAG Service successfully indexed {len(records)} reference records.")
        except Exception as e:
            # Deliberately broad: a broken reference file must not prevent the
            # service from starting; retrieval then works on what was loaded.
            print(f"[ERROR] Failed to initialize RAG index: {e}")
        self.corpus = records

    @staticmethod
    def _score(clean_term: str, record_term: str, term_parts: List[str], record_def: str) -> int:
        """Return the relevance of one lower-cased query term for one record.

        10 for an exact term match, 5 when the query equals one word of the
        record term, and for queries longer than three characters 5 for a
        substring of the term or 2 for a substring of the definition.
        Anything else scores 0.
        """
        if clean_term == record_term:
            return 10
        if clean_term in term_parts:
            return 5
        # Very short queries would match almost everything as substrings.
        if len(clean_term) > 3:
            if clean_term in record_term:
                return 5
            if clean_term in record_def:
                return 2
        return 0

    def _search(self, query_terms: List[str], max_results: Optional[int]) -> List[Dict]:
        """Rank corpus records against the query terms.

        Shared by retrieve_grounding_context and retrieve_grounding_list.

        Args:
            query_terms: Raw query strings; blank and non-string entries are
                ignored, duplicates are searched once.
            max_results: Maximum number of records returned. Zero or a
                negative value yields an empty list.

        Returns:
            Matching records, best score first. A record is scored by its
            best-matching query term; ties keep first-match order.
        """
        if not query_terms or not self.corpus:
            return []
        if max_results is not None and max_results <= 0:
            return []

        # Normalise once and drop duplicates while keeping first-seen order.
        cleaned = list(dict.fromkeys(
            term.lower().strip()
            for term in query_terms
            if isinstance(term, str) and term.strip()
        ))
        if not cleaned:
            return []

        # Lower-case and split every record once instead of once per query term.
        indexed = []
        for record in self.corpus:
            record_term = record["term"].lower()
            indexed.append((
                record,
                record_term,
                _TERM_SEPARATOR_RE.split(record_term),
                # str() so a non-string note in the JSON cannot crash the search.
                str(record["definition"]).lower(),
            ))

        # key -> (best score, record). Re-assigning an existing key keeps its
        # original position, so equal scores stay in first-match order.
        best: Dict[Tuple[str, str], Tuple[int, Dict]] = {}
        for clean_term in cleaned:
            for record, record_term, term_parts, record_def in indexed:
                score = self._score(clean_term, record_term, term_parts, record_def)
                if score <= 0:
                    continue
                record_key = (record["category"], record["term"])
                previous = best.get(record_key)
                if previous is None:
                    best[record_key] = (score, record)
                elif score > previous[0]:
                    # A later term matched better (e.g. exact after partial).
                    best[record_key] = (score, previous[1])

        # sorted() is stable, also with reverse=True.
        ranked = sorted(best.values(), key=lambda entry: entry[0], reverse=True)
        return [record for _, record in ranked[:max_results]]

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Args:
            query_terms: Terms to look up in the corpus.
            max_results: Maximum number of records to include.

        Returns:
            A markdown block with one numbered entry per record, or ``""``
            when nothing matches.
        """
        top_matches = self._search(query_terms, max_results)
        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f" *Source Citation:* {match['citation']}"
            )
        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations.

        Args:
            query_terms: Terms to look up in the corpus.
            max_results: Maximum number of records to return.

        Returns:
            The matching corpus records, most relevant first; ``[]`` when
            nothing matches.
        """
        return self._search(query_terms, max_results)

    def enrich_prompt(
        self,
        base_system_prompt: str,
        extracted_text: str,
        extracted_symbols: Optional[List[str]] = None,
    ) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Args:
            base_system_prompt: Prompt to extend.
            extracted_text: Free text whose first 15 words become query terms.
            extracted_symbols: Optional symbol labels, queried before the words.

        Returns:
            ``base_system_prompt`` unchanged when no record matches, otherwise
            the prompt followed by the grounding context and citation
            instructions.
        """
        query_terms: List[str] = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)

        if extracted_text:
            words = _QUERY_WORD_RE.findall(extracted_text)
            query_terms.extend(words[:15])  # Cap to prevent excessive token use

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
        if not grounding_context:
            return base_system_prompt

        enriched_prompt = (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )
        return enriched_prompt