Spaces:

Akshay30
/

decipherai-api

Sleeping

App Files Files Community

decipherai-api / services /rag_service.py

Akshay30

Initial DecipherAI backend deployment

2f4af3f 4 days ago

raw

history blame contribute delete

8.1 kB

	import json
	import os
	import re
	from typing import Dict, List, Optional, Tuple

	from config import Config

	class RAGService:
	    """Keyword-based retrieval over a JSON corpus of historical sign references.

	    The corpus is loaded at construction time from ``references_path`` (or
	    ``Config().REFERENCES_PATH`` when no path is given). Each indexed record
	    is a dict with the keys ``category``, ``term``, ``definition`` and
	    ``citation``. Loading is best-effort: problems are reported with
	    ``print`` and leave the corpus empty rather than raising.
	    """

	    # (JSON section key, category label, citation) for every indexed section.
	    _SECTIONS = (
	        (
	            "egypt_symbol_notes",
	            "Egyptian Hieroglyphic Sign",
	            "Gardiner, A. H. (1957). Egyptian Grammar: Being an Introduction to the Study of Hieroglyphs. Oxford: Griffith Institute.",
	        ),
	        (
	            "greek_symbol_notes",
	            "Greek Paleography Mark",
	            "Thompson, E. M. (1912). An Introduction to Greek and Latin Palaeography. Oxford: Clarendon Press.",
	        ),
	        (
	            "latin_symbol_notes",
	            "Latin Scribal Abbreviation",
	            "Cappelli, A. (1928). Dizionario di Abbreviature Latine ed Italiane. Milan: Hoepli.",
	        ),
	        (
	            "cuneiform_symbol_notes",
	            "Mesopotamian Cuneiform Logogram",
	            "Borger, R. (2004). Mesopotamisches Zeichenlexikon. Münster: Ugarit-Verlag.",
	        ),
	    )

	    # Splits a record term into word parts, e.g. "woman_seated" -> ["woman", "seated"].
	    _TERM_SPLIT_RE = re.compile(r'[_ \-]')

	    # Query words: ASCII alphanumerics, Greek (U+0370-U+03FF), Greek Extended
	    # (U+1F00-U+1FFF) and CJK Unified Ideographs (U+4E00-U+9FFF).
	    _WORD_RE = re.compile(r'[a-zA-Z0-9\u0370-\u03FF\u1F00-\u1FFF\u4E00-\u9FFF]+')

	    def __init__(self, references_path: Optional[str] = None):
	        """Create the service and index the reference corpus immediately.

	        Args:
	            references_path: Path to the JSON reference file. When falsy, the
	                path is taken from ``Config().REFERENCES_PATH``.
	        """
	        self.config = Config()
	        self.references_path = references_path or str(self.config.REFERENCES_PATH)
	        self.corpus = []
	        self.load_corpus()

	    def load_corpus(self):
	        """Load and index the historical reference document corpus.

	        Reads the JSON file at ``self.references_path`` and rebuilds
	        ``self.corpus`` from the sections listed in ``_SECTIONS``. The index
	        is built in a local list and swapped in only on success, so calling
	        this method again does not duplicate records and a failed load leaves
	        the previous corpus untouched. Entries whose note is not a string are
	        skipped because searching lowercases every definition.

	        Never raises: a missing file is reported as ``[WARN]`` and any other
	        failure as ``[ERROR]`` on stdout.
	        """
	        try:
	            with open(self.references_path, "r", encoding="utf-8") as f:
	                data = json.load(f)

	            if not isinstance(data, dict):
	                raise ValueError("top-level JSON value must be an object")

	            records = []
	            skipped = 0
	            for section_key, category, citation in self._SECTIONS:
	                section = data.get(section_key, {})
	                if not isinstance(section, dict):
	                    print(f"[WARN] RAG section '{section_key}' is not a JSON object; skipped.")
	                    continue
	                for term, note in section.items():
	                    if not isinstance(note, str):
	                        skipped += 1
	                        continue
	                    records.append({
	                        "category": category,
	                        "term": term,
	                        "definition": note,
	                        "citation": citation,
	                    })

	            self.corpus = records
	            if skipped:
	                print(f"[WARN] RAG Service skipped {skipped} reference entries with non-text notes.")
	            print(f"[INFO] RAG Service successfully indexed {len(self.corpus)} reference records.")
	        except FileNotFoundError:
	            print(f"[WARN] Reference path {self.references_path} not found for RAG index.")
	        except Exception as e:
	            # Deliberate best-effort boundary: the service must stay
	            # constructible even when the reference file is unusable.
	            print(f"[ERROR] Failed to initialize RAG index: {e}")

	    def _rank_records(self, query_terms: List[str], max_results: Optional[int]) -> List[Dict]:
	        """Return the corpus records matching ``query_terms``, best score first.

	        Scoring of a cleaned (lowercased, stripped) query term against a record:
	        10 for an exact term match; 5 when it equals one word part of the
	        term; and, only for terms longer than three characters, 5 when it is
	        a substring of the term or 2 when it is a substring of the definition.

	        A record hit by several query terms keeps its highest score. Records
	        with equal scores stay in the order in which they were first matched.
	        Non-string and blank query terms are ignored.
	        """
	        if not query_terms or not self.corpus:
	            return []
	        if max_results is not None and max_results <= 0:
	            return []

	        # Normalise every record once instead of once per query term.
	        indexed = []
	        for record in self.corpus:
	            term_lc = record["term"].lower()
	            indexed.append((
	                record,
	                term_lc,
	                self._TERM_SPLIT_RE.split(term_lc),
	                record["definition"].lower(),
	            ))

	        # (category, term) -> [best score, record]; dict order = first-hit order.
	        best = {}
	        processed = set()

	        for term in query_terms:
	            if not isinstance(term, str):
	                continue
	            clean_term = term.lower().strip()
	            if not clean_term or clean_term in processed:
	                continue
	            processed.add(clean_term)

	            # Short terms are too noisy for substring matching.
	            allow_substring = len(clean_term) > 3

	            for record, term_lc, term_parts, def_lc in indexed:
	                if clean_term == term_lc:
	                    score = 10
	                elif clean_term in term_parts:
	                    score = 5
	                elif allow_substring and clean_term in term_lc:
	                    score = 5
	                elif allow_substring and clean_term in def_lc:
	                    score = 2
	                else:
	                    continue

	                record_key = (record["category"], record["term"])
	                entry = best.get(record_key)
	                if entry is None:
	                    best[record_key] = [score, record]
	                elif score > entry[0]:
	                    entry[0] = score

	        # sorted() is stable, so ties keep their first-hit order.
	        ranked = sorted(best.values(), key=lambda entry: entry[0], reverse=True)
	        return [record for _, record in ranked[:max_results]]

	    def retrieve_grounding_context(self, query_terms: List[str], max_results: int = 5) -> str:
	        """Search reference records and build a grounding context string with academic citations.

	        Args:
	            query_terms: Candidate keywords (symbol labels or words).
	            max_results: Maximum number of records to include.

	        Returns:
	            A markdown-style numbered list of matching records with their
	            citations, or ``""`` when nothing matches.
	        """
	        top_matches = self._rank_records(query_terms, max_results)
	        if not top_matches:
	            return ""

	        context_lines = ["### Scholarly Grounding and Sign References:"]
	        for idx, match in enumerate(top_matches, 1):
	            context_lines.append(
	                f"{idx}. [{match['category']}] '{match['term']}': {match['definition']}\n"
	                f" Source Citation: {match['citation']}"
	            )

	        return "\n".join(context_lines)

	    def retrieve_grounding_list(self, query_terms: List[str], max_results: int = 5) -> List[Dict]:
	        """Search reference records and return the raw list of matching reference dicts with citations.

	        Args:
	            query_terms: Candidate keywords (symbol labels or words).
	            max_results: Maximum number of records to return.

	        Returns:
	            The matching corpus records (the stored dicts themselves, not
	            copies), best match first; ``[]`` when nothing matches.
	        """
	        return self._rank_records(query_terms, max_results)

	    def enrich_prompt(self, base_system_prompt: str, extracted_text: str, extracted_symbols: Optional[List[str]] = None) -> str:
	        """Enrich LLM prompts with RAG context and citation grounding instructions.

	        Query terms are the given symbol labels followed by the first 15 words
	        found in ``extracted_text``. Up to six matching records are appended
	        to the prompt together with citation instructions.

	        Args:
	            base_system_prompt: Prompt to extend.
	            extracted_text: Text from which query words are taken; may be empty.
	            extracted_symbols: Optional symbol labels to search for.

	        Returns:
	            The enriched prompt, or ``base_system_prompt`` unchanged when no
	            reference record matches.
	        """
	        query_terms = []
	        if extracted_symbols:
	            query_terms.extend(extracted_symbols)

	        if extracted_text:
	            words = self._WORD_RE.findall(extracted_text)
	            query_terms.extend(words[:15])  # Cap to prevent excessive token use

	        grounding_context = self.retrieve_grounding_context(query_terms, max_results=6)
	        if not grounding_context:
	            return base_system_prompt

	        return (
	            f"{base_system_prompt}\n\n"
	            "Here is some verified historical and paleographical grounding information that you MUST use "
	            "in your analysis. Cite the specific sources (e.g. Gardiner, Cappelli, Thompson, Borger) "
	            "whenever discussing these symbols:\n\n"
	            f"{grounding_context}\n\n"
	            "In your final output, append a short section titled 'References and Citations' detailing the relevant sources used."
	        )