Spaces:
Sleeping
Sleeping
import json
import os
import re
from typing import Dict, List, Optional, Tuple

from config import Config
class RAGService:
    """Keyword retrieval over a small corpus of paleographic sign references.

    The corpus is read from a JSON file whose top-level keys (see
    ``_SECTIONS``) each map a sign/term name to a descriptive note. Every
    note becomes one record carrying its category label and a fixed academic
    citation. Records are searched with a simple keyword score and can be
    rendered into a grounding section appended to an LLM system prompt.
    """

    # (JSON section key, category label, citation) for each indexed script.
    _SECTIONS: Tuple[Tuple[str, str, str], ...] = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    # Separators that split a record term into whole-word parts
    # (e.g. "woman_seated" -> ["woman", "seated"]).
    _TERM_SPLIT = re.compile(r'[_ \-]')

    # Runs of ASCII alphanumerics, Greek / Greek Extended, or CJK ideographs.
    _WORD_PATTERN = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')

    def __init__(self, references_path: Optional[str] = None):
        """Create the service and load the corpus immediately.

        Args:
            references_path: Path to the JSON reference file. When omitted
                (or empty), ``Config().REFERENCES_PATH`` is used.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        Best-effort: a missing file prints a warning and any failure while
        reading or indexing prints an error; neither raises. Records indexed
        before a failure are kept.
        """
        # Start from an empty index so that calling this again reloads the
        # file instead of appending a second copy of every record.
        self.corpus = []
        try:
            if not os.path.exists(self.references_path):
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
                return

            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            for section_key, category, citation in self._SECTIONS:
                for term, note in data.get(section_key, {}).items():
                    self.corpus.append({
                        "category": category,
                        "term": term,
                        "definition": note,
                        "citation": citation,
                    })

            print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
        except Exception as e:
            # Deliberately broad: a bad reference file must not prevent the
            # service from being constructed.
            print(f"[ERROR] Failed to initialize RAG index: {e}")

    @staticmethod
    def _score(clean_term: str, record_term: str, term_parts: List[str], record_def: str) -> int:
        """Return the relevance of one lowercased query term for one record.

        10 for an exact term match, 5 for a whole-word part of the term,
        and, only for query terms longer than three characters, 5 for a
        substring of the term or 2 for a substring of the definition.
        0 means no match.
        """
        if clean_term == record_term:
            return 10
        if clean_term in term_parts:
            return 5
        # Short terms are limited to exact / whole-part matches above so they
        # do not hit arbitrary substrings.
        if len(clean_term) > 3:
            if clean_term in record_term:
                return 5
            if clean_term in record_def:
                return 2
        return 0

    def _search(self, query_terms: List[str], max_results: Optional[int]) -> List[Dict]:
        """Return up to ``max_results`` corpus records ranked by best score.

        Each record appears once and is ranked by the highest score any query
        term gave it. Ties keep the order in which records first matched.
        """
        if not query_terms or not self.corpus:
            return []
        if max_results is not None and max_results <= 0:
            return []

        # Lowercase/split every record once instead of once per query term.
        prepared = []
        for record in self.corpus:
            record_term = str(record["term"]).lower()
            record_def = str(record["definition"]).lower()
            prepared.append((record, record_term, self._TERM_SPLIT.split(record_term), record_def))

        # (category, term) -> [best score, record]; dict order preserves the
        # first-match order used to break ties.
        best = {}
        for term in query_terms:
            if not isinstance(term, str):
                continue
            clean_term = term.lower().strip()
            if not clean_term:
                continue
            for record, record_term, term_parts, record_def in prepared:
                score = self._score(clean_term, record_term, term_parts, record_def)
                if score <= 0:
                    continue
                key = (record["category"], record["term"])
                entry = best.get(key)
                if entry is None:
                    best[key] = [score, record]
                elif score > entry[0]:
                    # A later, stronger match must be able to raise the rank.
                    entry[0] = score

        # sorted() is stable, so equal scores stay in first-match order.
        ranked = sorted(best.values(), key=lambda entry: entry[0], reverse=True)
        return [record for _, record in ranked[:max_results]]

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Args:
            query_terms: Keywords to look up; blank or non-string entries are
                ignored.
            max_results: Maximum number of records to include.

        Returns:
            A markdown section with one numbered entry per matching record,
            or an empty string when nothing matches.
        """
        top_matches = self._search(query_terms, max_results)
        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f" *Source Citation:* {match['citation']}"
            )
        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations.

        Args:
            query_terms: Keywords to look up; blank or non-string entries are
                ignored.
            max_results: Maximum number of records to return.

        Returns:
            The matching corpus records (dicts with ``category``, ``term``,
            ``definition`` and ``citation``), best match first; empty when
            nothing matches.
        """
        return self._search(query_terms, max_results)

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str,
                      extracted_symbols: Optional[List[str]] = None) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Args:
            base_system_prompt: The prompt to extend.
            extracted_text: Free text; its first 15 word tokens are used as
                query terms.
            extracted_symbols: Optional symbol labels, queried before the
                words taken from ``extracted_text``.

        Returns:
            ``base_system_prompt`` unchanged when no reference matches,
            otherwise the prompt followed by the grounding section and
            citation instructions.
        """
        query_terms = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)

        if extracted_text:
            words = self._WORD_PATTERN.findall(extracted_text)
            query_terms.extend(words[:15])  # Cap to prevent excessive token use

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
        if not grounding_context:
            return base_system_prompt

        return (
            f"{base_system_prompt}\n\n"
            "Here is some verified historical and paleographical grounding information that you MUST use "
            "in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            "whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            "In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )