File size: 8,103 Bytes
2f4af3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import json
import os
import re
from typing import Dict, List, Tuple
from config import Config

class RAGService:
    """Keyword retrieval over a JSON file of historical sign references.

    The reference file is flattened into ``self.corpus``, a list of dicts
    with the keys ``category``, ``term``, ``definition`` and ``citation``.
    Queries are answered with a simple scored keyword match against the
    ``term`` and ``definition`` fields.
    """

    # (JSON section key, category label, citation) for every indexed section,
    # listed in the order the sections are appended to the corpus.
    _SECTIONS: Tuple[Tuple[str, str, str], ...] = (
        (
            "egypt_symbol_notes",
            "Egyptian Hieroglyphic Sign",
            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
        ),
        (
            "greek_symbol_notes",
            "Greek Paleography Mark",
            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
        ),
        (
            "latin_symbol_notes",
            "Latin Scribal Abbreviation",
            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
        ),
        (
            "cuneiform_symbol_notes",
            "Mesopotamian Cuneiform Logogram",
            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
        ),
    )

    # Separators that split a record term such as "woman_seated" into parts.
    _TERM_SEPARATORS = re.compile(r'[_ \-]')

    # Runs of ASCII alphanumerics plus the U+0370-03FF, U+1F00-1FFF and
    # U+4E00-9FFF ranges; used to pull candidate query words out of free text.
    _WORD_PATTERN = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')

    # Maximum number of words taken from the extracted text per prompt.
    _MAX_TEXT_TERMS = 15

    def __init__(self, references_path: str = None):
        """Create the service and load the reference corpus.

        Args:
            references_path: Path to the JSON reference file. When omitted
                (or empty), ``Config().REFERENCES_PATH`` is used instead.
        """
        self.config = Config()
        self.references_path = references_path or str(self.config.REFERENCES_PATH)
        self.corpus = []
        self.load_corpus()

    def load_corpus(self):
        """Load and index the historical reference document corpus.

        Reads the JSON file at ``self.references_path`` and rebuilds
        ``self.corpus`` from the sections listed in ``_SECTIONS``. The corpus
        is replaced rather than appended to, so calling this method again
        does not duplicate records.

        Problems are reported with ``print`` and never raised: a missing file
        produces a warning, an unreadable or malformed file produces an
        error, and in both cases the existing corpus is left untouched.
        """
        try:
            with open(self.references_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
            return
        except (OSError, ValueError, TypeError) as e:
            # ValueError covers invalid JSON and undecodable bytes; TypeError
            # covers a references_path that is not a usable path value.
            print(f"[ERROR] Failed to initialize RAG index: {e}")
            return

        if not isinstance(data, dict):
            print("[ERROR] Failed to initialize RAG index: top-level JSON value must be an object")
            return

        records = []
        for section_key, category, citation in self._SECTIONS:
            section = data.get(section_key, {})
            if not isinstance(section, dict):
                print(f"[WARN] Reference section {section_key} is not an object; skipped.")
                continue
            for term, note in section.items():
                # Retrieval lower-cases every definition, so a non-text note
                # would break every later query; leave such entries out.
                if not isinstance(note, str):
                    continue
                records.append({
                    "category": category,
                    "term": term,
                    "definition": note,
                    "citation": citation,
                })

        self.corpus = records
        print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")

    @staticmethod
    def _score_match(clean_term: str, record_term: str, term_parts: List[str], record_def: str) -> int:
        """Score one normalised query term against one lower-cased record.

        Returns 10 for an exact term match, 5 when the query equals one of
        the separator-delimited parts of the term (e.g. "woman" in
        "woman_seated"), and, for queries longer than three characters only,
        5 for a substring of the term or 2 for a substring of the definition.
        Returns 0 when nothing matches.
        """
        if clean_term == record_term:
            return 10
        if clean_term in term_parts:
            return 5
        # Short queries are too noisy for substring matching.
        if len(clean_term) > 3:
            if clean_term in record_term:
                return 5
            if clean_term in record_def:
                return 2
        return 0

    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
        """Search reference records and build a grounding context string.

        Args:
            query_terms: Candidate keywords to look up.
            max_results: Maximum number of records to include.

        Returns:
            A Markdown-formatted block with a heading followed by one
            numbered entry (definition plus source citation) per match, or an
            empty string when nothing matches.
        """
        top_matches = self.retrieve_grounding_list(query_terms, max_results=max_results)
        if not top_matches:
            return ""

        context_lines = ["### Scholarly Grounding and Sign References:"]
        for idx, match in enumerate(top_matches, 1):
            context_lines.append(
                f"{idx}. **[{match['category']}] '{match['term']}'**: {match['definition']}\n"
                f"   *Source Citation:* {match['citation']}"
            )
        return "\n".join(context_lines)

    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
        """Search reference records and return the matching reference dicts.

        Every query term is lower-cased and stripped, then scored against
        every record (see ``_score_match``). A record matched by several
        terms is returned once and ranked by its best score. Records with
        equal scores keep the order in which they were first matched.

        Args:
            query_terms: Candidate keywords; blank and non-string entries
                are ignored.
            max_results: Maximum number of records to return.

        Returns:
            Up to ``max_results`` corpus dicts (the stored objects, not
            copies), best score first. Empty when there is nothing to search
            for or nothing matches.
        """
        if not query_terms or not self.corpus:
            return []

        # Normalise the query once; repeated terms cannot change the result.
        normalised = dict.fromkeys(
            term.lower().strip() for term in query_terms if isinstance(term, str)
        )
        clean_terms = [term for term in normalised if term]
        if not clean_terms:
            return []

        # Lower-case and split each record once instead of once per term.
        indexed = []
        for record in self.corpus:
            record_term = record["term"].lower()
            indexed.append((
                f"{record['category']}:{record['term']}",
                record,
                record_term,
                self._TERM_SEPARATORS.split(record_term),
                record["definition"].lower(),
            ))

        # record key -> (best score, record); dicts keep first-insertion
        # order even when a value is later replaced with a higher score.
        best = {}
        for clean_term in clean_terms:
            for record_key, record, record_term, term_parts, record_def in indexed:
                score = self._score_match(clean_term, record_term, term_parts, record_def)
                if score <= 0:
                    continue
                previous = best.get(record_key)
                if previous is None or score > previous[0]:
                    best[record_key] = (score, record)

        # sorted() is stable, so ties stay in first-matched order.
        ranked = sorted(best.values(), key=lambda entry: entry[0], reverse=True)
        return [record for _, record in ranked[:max_results]]

    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: List[str] = None) -> str:
        """Enrich an LLM prompt with RAG context and citation instructions.

        Args:
            base_system_prompt: The prompt to extend.
            extracted_text: Free text whose first words are used as query
                terms.
            extracted_symbols: Optional symbol labels, queried before the
                words taken from ``extracted_text``.

        Returns:
            ``base_system_prompt`` unchanged when no reference matches;
            otherwise the prompt followed by the grounding context and an
            instruction to cite the sources.
        """
        query_terms = list(extracted_symbols) if extracted_symbols else []

        if extracted_text:
            words = self._WORD_PATTERN.findall(extracted_text)
            # Cap the number of words to prevent excessive token use.
            query_terms.extend(words[:self._MAX_TEXT_TERMS])

        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
        if not grounding_context:
            return base_system_prompt

        return (
            f"{base_system_prompt}\n\n"
            f"Here is some verified historical and paleographical grounding information that you MUST use "
            f"in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
            f"whenever discussing these symbols:\n\n"
            f"{grounding_context}\n\n"
            f"In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
        )