Spaces:

Akshay30
/

decipherai-api

Sleeping

File size: 8,103 Bytes

2f4af3f

import json
import os
import re
from typing import Dict, List, Tuple
from config import Config

class RAGService:
    """Keyword retrieval over a JSON corpus of historical sign and abbreviation notes.

    The corpus is read from ``references_path`` when the service is created.
    Every indexed record is a dict with the keys ``category``, ``term``,
    ``definition`` and ``citation``. Retrieval is a plain keyword match with a
    small fixed scoring scheme (see ``_score_term``); no embeddings are used.
    """

    # One entry per indexed JSON section: (section key, category label, citation).
    _SECTIONS = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    # Separators that split a record term into whole-word parts
    # (e.g. "woman_seated" -> ["woman", "seated"]).
    _TERM_SPLIT = re.compile(r'[_ \-]')

    # Runs of ASCII alphanumerics, Greek and Coptic (U+0370-03FF), Greek
    # Extended (U+1F00-1FFF) or CJK Unified Ideographs (U+4E00-9FFF) are
    # treated as query words when tokenising extracted text.
    _WORD_PATTERN = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')

    def __init__(self, references_path: str = None):
        """Create the service and index the reference corpus.

        Args:
            references_path: Path of the JSON reference file. When omitted or
                empty, ``Config().REFERENCES_PATH`` is used instead.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        This is best-effort: a missing file or any error while reading or
        indexing is reported on stdout and never raised. Calling the method
        again rebuilds the index instead of appending duplicate records.
        """
        try:
            if not os.path.exists(self.references_path):
                print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
                return

            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Reset only after the file has parsed, so a failed read keeps the
            # previously indexed records while a successful one replaces them.
            self.corpus = []
            for section_key, category, citation in self._SECTIONS:
                for term, note in data.get(section_key, {}).items():
                    self.corpus.append({
                        "category": category,
                        "term": term,
                        "definition": note,
                        "citation": citation,
                    })

            print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
        except Exception as e:
            print(f"[ERROR] Failed to initialize RAG index: {e}")

    @staticmethod
    def _score_term(clean_term: str, record_term: str, term_parts: List[str], record_def: str) -> int:
        """Score one normalised query term against one normalised record.

        Returns 10 for an exact term match, 5 when the query equals one of the
        record term's word parts, and, for queries longer than three
        characters only, 5 for a substring of the record term or 2 for a
        substring of the definition. Returns 0 when nothing matches.
        """
        if clean_term == record_term:
            return 10
        if clean_term in term_parts:
            return 5
        # Short queries are excluded from substring matching to avoid noise.
        if len(clean_term) > 3:
            if clean_term in record_term:
                return 5
            if clean_term in record_def:
                return 2
        return 0

    def _search(self, query_terms: List[str], max_results: int) -> List[Dict]:
        """Return up to ``max_results`` corpus records ranked by keyword score.

        Each record is ranked by the best score any query term achieves
        against it, so the result does not depend on the order in which a
        partial and an exact match for the same record appear in
        ``query_terms``. Ties keep first-match order.
        """
        if not query_terms or not self.corpus:
            return []
        # A non-positive limit means "nothing"; a plain slice with a negative
        # bound would instead drop results from the end.
        if max_results is not None and max_results <= 0:
            return []

        # Normalise once, skip blank or non-string entries, and drop repeats
        # while keeping the original order.
        clean_terms = list(dict.fromkeys(
            term.lower().strip()
            for term in query_terms
            if isinstance(term, str) and term.strip()
        ))
        if not clean_terms:
            return []

        # Per-record normalisation does not depend on the query term, so it is
        # done once up front rather than inside the term loop.
        prepared = []
        for record in self.corpus:
            record_term = record["term"].lower()
            definition = record["definition"]
            record_def = definition.lower() if isinstance(definition, str) else ""
            prepared.append((
                f"{record['category']}:{record['term']}",
                record,
                record_term,
                self._TERM_SPLIT.split(record_term),
                record_def,
            ))

        # Maps record key -> (best score, record). Updating an existing key
        # keeps its insertion position, which preserves first-match order.
        best: Dict[str, Tuple[int, Dict]] = {}
        for clean_term in clean_terms:
            for record_key, record, record_term, term_parts, record_def in prepared:
                score = self._score_term(clean_term, record_term, term_parts, record_def)
                if score <= 0:
                    continue
                previous = best.get(record_key)
                if previous is None or score > previous[0]:
                    best[record_key] = (score, record)

        # sorted() is stable, so equally scored records stay in match order.
        ranked = sorted(best.values(), key=lambda pair: pair[0], reverse=True)
        return [record for _, record in ranked[:max_results]]

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string with academic citations.

        Args:
            query_terms: Keywords or symbol labels to look up.
            max_results: Maximum number of records to include.

        Returns:
            A markdown-style numbered list of the matching records with their
            citations, or an empty string when nothing matches.
        """
        top_matches = self._search(query_terms, max_results)
        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f"   *Source Citation:* {match['citation']}"
            )

        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the raw list of matching reference dicts with citations.

        Args:
            query_terms: Keywords or symbol labels to look up.
            max_results: Maximum number of records to return.

        Returns:
            The matching corpus records, best match first; an empty list when
            nothing matches.
        """
        return self._search(query_terms, max_results)

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
        """Enrich LLM prompts with RAG context and citation grounding instructions.

        Args:
            base_system_prompt: The prompt to extend.
            extracted_text: Free text whose first 15 words are used as queries.
            extracted_symbols: Optional symbol labels, queried before the text.

        Returns:
            ``base_system_prompt`` unchanged when no reference matches,
            otherwise the prompt followed by the grounding context and
            citation instructions.
        """
        query_terms = []
        if extracted_symbols:
            query_terms.extend(extracted_symbols)

        if extracted_text:
            words = self._WORD_PATTERN.findall(extracted_text)
            query_terms.extend(words[:15])  # Cap to prevent excessive token use

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)

        if not grounding_context:
            return base_system_prompt

        enriched_prompt = (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )
        return enriched_prompt