# decipherai-api/services/rag_service.py
# Akshay30
# Initial DecipherAI backend deployment
# 2f4af3f
import json
import os
import re
from typing import Dict, List, Tuple
from config import Config
class RAGService:
    """Keyword retrieval over a small corpus of paleographic reference notes.

    The corpus is read from a JSON file whose top-level object may contain the
    sections listed in ``_SECTIONS``; each section maps a sign/term name to a
    note. Every indexed record is a dict with the keys ``category``, ``term``,
    ``definition`` and ``citation``. Retrieval is plain substring/keyword
    scoring (no embeddings), and the results are used to enrich LLM prompts
    with citation-bearing grounding text.
    """

    # (JSON section key, category label, citation attached to each record).
    _SECTIONS = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    # Splits a record term such as "woman_seated" into its word parts.
    _TERM_PART_SPLIT = re.compile(r'[_ \-]')
    # Extracts query words (ASCII alphanumerics plus the listed Unicode ranges).
    _QUERY_WORD = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')
    # Cap on words taken from free text, to prevent excessive token use.
    _MAX_TEXT_TERMS = 15

    def __init__(self, references_path: str = None):
        """Create the service and index the reference file immediately.

        Args:
            references_path: Path to the JSON reference file. When falsy, the
                path is taken from ``Config().REFERENCES_PATH``.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        On success ``self.corpus`` is replaced with the freshly indexed
        records, so calling this method again does not duplicate entries.
        If the file is missing or cannot be read/parsed, a message is printed
        and ``self.corpus`` is left unchanged; this method never raises.
        A section whose value is not a JSON object is skipped with a warning
        instead of aborting the remaining sections.
        """
        try:
            if not os.path.exists(self.references_path):
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
                return

            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError("top-level JSON value must be an object")

            records = []
            for section_key, category, citation in self._SECTIONS:
                notes = data.get(section_key, {})
                if not isinstance(notes, dict):
                    print(f"[WARN] Skipping malformed reference section '{section_key}'.")
                    continue
                for term, note in notes.items():
                    records.append({
                        "category": category,
                        "term": term,
                        "definition": note,
                        "citation": citation,
                    })

            self.corpus = records
            print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
        except Exception as e:
            # Deliberately broad: a broken reference file must degrade the
            # service to "no grounding" rather than fail construction.
            print(f"[ERROR] Failed to initialize RAG index: {e}")

    @staticmethod
    def _score_match(query: str, record_term: str, term_parts: List[str], record_def: str) -> int:
        """Score one lower-cased query term against one lower-cased record.

        Returns 10 for an exact term match, 5 when the query equals one of the
        term's word parts (e.g. "woman" in "woman_seated") or, for queries
        longer than three characters, is a substring of the term, 2 when such
        a query is a substring of the definition, and 0 otherwise.
        """
        if query == record_term:
            return 10
        if query in term_parts:
            return 5
        # Short queries are not substring-matched: they would hit almost anything.
        if len(query) > 3:
            if query in record_term:
                return 5
            if query in record_def:
                return 2
        return 0

    def _rank_records(self, query_terms: List[str], max_results: int) -> List[Dict]:
        """Return up to ``max_results`` corpus records, best score first.

        Each record appears at most once and is ranked by the highest score
        any query term achieved for it. Records with equal scores keep the
        order in which they were first matched. Non-string and blank query
        terms are ignored. A negative ``max_results`` is treated as zero;
        ``None`` means no limit.
        """
        if not query_terms or not self.corpus:
            return []

        limit = None if max_results is None else max(max_results, 0)

        queries = []
        for term in query_terms:
            if not isinstance(term, str):
                continue
            cleaned = term.lower().strip()
            if cleaned:
                queries.append(cleaned)
        if not queries:
            return []

        # Normalise every record once instead of once per query term.
        prepared = []
        for record in self.corpus:
            record_term = str(record["term"]).lower()
            prepared.append((
                record,
                record_term,
                self._TERM_PART_SPLIT.split(record_term),
                str(record["definition"]).lower(),
            ))

        # record key -> [best score, record]; dict order is first-match order.
        best = {}
        for query in queries:
            for record, record_term, term_parts, record_def in prepared:
                score = self._score_match(query, record_term, term_parts, record_def)
                if score <= 0:
                    continue
                record_key = f"{record['category']}:{record['term']}"
                entry = best.get(record_key)
                if entry is None:
                    best[record_key] = [score, record]
                elif score > entry[0]:
                    # A later query term matched this record more strongly.
                    entry[0] = score

        # sorted() is stable, so ties stay in first-match order.
        ranked = sorted(best.values(), key=lambda entry: entry[0], reverse=True)
        return [record for _, record in ranked[:limit]]

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Returns an empty string when nothing matches; otherwise a heading line
        followed by one numbered entry (definition plus source citation) per
        matching record.
        """
        top_matches = self._rank_records(query_terms, max_results)
        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f" *Source Citation:* {match['citation']}"
            )
        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations.

        The returned dicts are the corpus records themselves (not copies),
        ordered best match first.
        """
        return self._rank_records(query_terms, max_results)

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Query terms are the given symbol labels followed by the first
        ``_MAX_TEXT_TERMS`` words found in ``extracted_text``. When no
        reference record matches, ``base_system_prompt`` is returned unchanged.
        """
        query_terms = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)

        if extracted_text:
            words = self._QUERY_WORD.findall(extracted_text)
            query_terms.extend(words[:self._MAX_TEXT_TERMS])

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
        if not grounding_context:
            return base_system_prompt

        enriched_prompt = (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )
        return enriched_prompt