gurusahaay / app /services /concept_resolver.py
sonikan's picture
add ai improvement
c4db906
"""
Concept Resolution Service - THE CORE OF MULTILINGUAL SEARCH
This service implements the critical concept-based search logic:
1. Detect language of input
2. Normalize text (strip suffixes)
3. Match against CONCEPT_SYNONYMS table
4. If no match, use Gemini to translate and retry
5. Return canonical concept_id
CRITICAL RULES:
- NEVER search raw Kannada/Hindi text
- ALWAYS resolve to concept_id first
- All content queries use concept_id
"""
import re
from typing import Optional, Tuple, List
from sqlalchemy.orm import Session
from sqlalchemy import func
from app.models.concept import Concept, ConceptSynonym
from app.services.gemini_service import GeminiService
def levenshtein_distance(s1: str, s2: str) -> int:
"""Calculate Levenshtein distance between two strings."""
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def similarity_ratio(s1: str, s2: str) -> float:
"""Calculate similarity ratio between two strings (0-1)."""
distance = levenshtein_distance(s1.lower(), s2.lower())
max_len = max(len(s1), len(s2))
if max_len == 0:
return 1.0
return 1.0 - (distance / max_len)
class ConceptResolver:
"""
Resolves user input (in any language) to a canonical concept_id.
This enables multilingual search without vector embeddings.
"""
# Common Kannada suffixes to strip for normalization
KANNADA_SUFFIXES = [
"ಗಳು", # plural
"ಕ್ರಿಯೆ", # action/process
"ವನ್ನು", # accusative
"ದಲ್ಲಿ", # locative
"ಯಲ್ಲಿ", # locative variant
"ಯಿಂದ", # instrumental
"ಗೆ", # dative
"ಯ", # genitive
]
# Common Hindi suffixes to strip
HINDI_SUFFIXES = [
"ों", # plural oblique
"ें", # plural
"ियाँ", # feminine plural
"ियों", # feminine plural oblique
"ना", # infinitive
"ने", # oblique infinitive
"में", # locative
"से", # instrumental
"को", # dative
"का", # genitive masculine
"की", # genitive feminine
"के", # genitive plural
]
# Common English stop words to filter out
ENGLISH_STOP_WORDS = {
"i", "me", "my", "we", "our", "you", "your", "he", "she", "it", "they",
"what", "which", "who", "whom", "this", "that", "these", "those",
"am", "is", "are", "was", "were", "be", "been", "being",
"have", "has", "had", "having", "do", "does", "did", "doing",
"a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
"while", "of", "at", "by", "for", "with", "about", "against", "between",
"into", "through", "during", "before", "after", "above", "below", "to",
"from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
"further", "then", "once", "here", "there", "when", "where", "why", "how",
"all", "each", "few", "more", "most", "other", "some", "such", "no", "nor",
"not", "only", "own", "same", "so", "than", "too", "very", "can", "will",
"just", "should", "now", "want", "need", "help", "understand", "learn",
"teach", "explain", "know", "tell", "show", "please", "could", "would"
}
@staticmethod
def extract_keywords(text: str) -> List[str]:
"""
Extract meaningful keywords from English text by removing stop words.
Returns list of keywords sorted by length (longer = more specific).
"""
words = text.lower().split()
keywords = [w for w in words if w not in ConceptResolver.ENGLISH_STOP_WORDS and len(w) > 2]
# Sort by length descending (longer words are usually more specific)
return sorted(keywords, key=len, reverse=True)
@staticmethod
def detect_language(text: str) -> str:
"""
Detect the language of input text.
Uses Unicode ranges for script detection.
Returns: 'kn' (Kannada), 'hi' (Hindi), or 'en' (English/default)
"""
# Kannada Unicode range: U+0C80 to U+0CFF
kannada_pattern = re.compile(r'[\u0C80-\u0CFF]')
# Devanagari (Hindi) Unicode range: U+0900 to U+097F
hindi_pattern = re.compile(r'[\u0900-\u097F]')
kannada_count = len(kannada_pattern.findall(text))
hindi_count = len(hindi_pattern.findall(text))
if kannada_count > hindi_count and kannada_count > 0:
return "kn"
elif hindi_count > 0:
return "hi"
return "en"
@staticmethod
def normalize_text(text: str, language: str) -> str:
"""
Normalize text by:
1. Converting to lowercase (for English)
2. Stripping common suffixes
3. Removing extra whitespace
This improves matching against concept synonyms.
"""
text = text.strip()
if language == "en":
text = text.lower()
elif language == "kn":
for suffix in ConceptResolver.KANNADA_SUFFIXES:
if text.endswith(suffix):
text = text[:-len(suffix)]
break
elif language == "hi":
for suffix in ConceptResolver.HINDI_SUFFIXES:
if text.endswith(suffix):
text = text[:-len(suffix)]
break
# Remove extra whitespace
text = " ".join(text.split())
return text
@staticmethod
def resolve_concept(db: Session, text: str, speech_language: Optional[str] = None) -> Tuple[Optional[str], str, str]:
"""
Main resolution function.
Args:
db: Database session
text: User input in any language
speech_language: Optional language detected by speech engine
Returns:
Tuple of (concept_id, detected_language, normalized_text)
concept_id is None if no match found
"""
# Step 1: Detect language
language = ConceptResolver.detect_language(text)
# If script detection says "en" but speech engine detected "kn" or "hi",
# it's likely transliterated text (Hinglish/Kanglish).
# In this case, trust the speech engine.
if language == "en" and speech_language in ["kn", "hi"]:
language = speech_language
print(f"[ConceptResolver] Overriding script language 'en' with speech language '{language}' (Transliteration detected)")
print(f"[ConceptResolver] Input: '{text}' | Language: {language}")
# Step 2: Normalize text
normalized = ConceptResolver.normalize_text(text, language)
print(f"[ConceptResolver] Normalized: '{normalized}'")
# Step 3: Try exact match first (case-insensitive for English)
if language == "en":
synonym = db.query(ConceptSynonym).filter(
func.lower(ConceptSynonym.term) == normalized.lower()
).first()
else:
# For Kannada/Hindi, try exact match
synonym = db.query(ConceptSynonym).filter(
ConceptSynonym.term == normalized
).first()
if synonym:
print(f"[ConceptResolver] ✓ Step 3: EXACT MATCH found -> {synonym.concept_id}")
return (synonym.concept_id, language, normalized)
print(f"[ConceptResolver] Step 3: Exact match - no match")
# Step 4: Try partial match (contains)
if language == "en":
synonym = db.query(ConceptSynonym).filter(
func.lower(ConceptSynonym.term).contains(normalized.lower())
).first()
else:
synonym = db.query(ConceptSynonym).filter(
ConceptSynonym.term.contains(normalized)
).first()
if synonym:
print(f"[ConceptResolver] ✓ Step 4: PARTIAL MATCH found -> {synonym.concept_id}")
return (synonym.concept_id, language, normalized)
print(f"[ConceptResolver] Step 4: Partial match - no match")
# Step 4.5: For English, extract keywords and try matching each
if language == "en":
keywords = ConceptResolver.extract_keywords(normalized)
print(f"[ConceptResolver] Step 5: Keywords extracted: {keywords}")
for keyword in keywords:
# Try exact match with keyword
synonym = db.query(ConceptSynonym).filter(
func.lower(ConceptSynonym.term) == keyword.lower()
).first()
if synonym:
print(f"[ConceptResolver] ✓ Step 5: KEYWORD MATCH '{keyword}' -> {synonym.concept_id}")
return (synonym.concept_id, language, normalized)
# Try if synonym contains keyword
synonym = db.query(ConceptSynonym).filter(
func.lower(ConceptSynonym.term).contains(keyword.lower())
).first()
if synonym:
print(f"[ConceptResolver] ✓ Step 5: KEYWORD PARTIAL MATCH '{keyword}' -> {synonym.concept_id}")
return (synonym.concept_id, language, normalized)
print(f"[ConceptResolver] Step 5: Keyword match - no match")
# Step 5: Try matching normalized text against any synonym
# This handles cases where user types partial term
synonyms = db.query(ConceptSynonym).filter(
ConceptSynonym.language == language
).all()
for syn in synonyms:
syn_normalized = ConceptResolver.normalize_text(syn.term, language)
if language == "en":
if normalized.lower() in syn_normalized.lower() or syn_normalized.lower() in normalized.lower():
print(f"[ConceptResolver] ✓ Step 6: SYNONYM SCAN MATCH '{syn.term}' -> {syn.concept_id}")
return (syn.concept_id, language, normalized)
else:
if normalized in syn_normalized or syn_normalized in normalized:
print(f"[ConceptResolver] ✓ Step 6: SYNONYM SCAN MATCH '{syn.term}' -> {syn.concept_id}")
return (syn.concept_id, language, normalized)
print(f"[ConceptResolver] Step 6: Synonym scan - no match")
# Step 6: Fuzzy matching for typos (English only)
# Use Levenshtein distance to find close matches
if language == "en" and len(normalized) >= 4:
best_match = None
best_score = 0.0
threshold = 0.7 # 70% similarity required
all_synonyms = db.query(ConceptSynonym).filter(
ConceptSynonym.language == "en"
).all()
for syn in all_synonyms:
score = similarity_ratio(normalized, syn.term)
if score > best_score and score >= threshold:
best_score = score
best_match = syn
if best_match:
print(f"[ConceptResolver] ✓ Step 7: FUZZY MATCH (score: {best_score:.2f}) '{best_match.term}' -> {best_match.concept_id}")
return (best_match.concept_id, language, normalized)
print(f"[ConceptResolver] Step 7: Fuzzy match - no match")
# Step 8: If no match, try Gemini AI to find the best topic
if GeminiService.is_available():
try:
# Get all existing topics for Gemini to match against
all_concepts = db.query(Concept).all()
existing_topics = [
{"id": c.concept_id, "name": c.description_en or c.concept_id.replace("_", " ").title()}
for c in all_concepts
]
# Use smart search to find the best matching topic
search_result = GeminiService.smart_search(text, existing_topics)
if search_result.get("matched_topics") and len(search_result["matched_topics"]) > 0:
best_topic = search_result["matched_topics"][0]
# Require 0.95+ relevance for a good match (must be very confident)
if best_topic.get("id") and best_topic.get("relevance", 0) >= 0.95:
print(f"[ConceptResolver] ✓ Step 8: GEMINI SMART SEARCH (relevance: {best_topic.get('relevance')}) -> {best_topic['id']}")
return (best_topic["id"], language, normalized)
else:
print(f"[ConceptResolver] Step 8: Gemini match rejected (relevance {best_topic.get('relevance', 0)} < 0.95)")
# No good match - try to create a new concept
print(f"[ConceptResolver] Step 9: Creating new concept...")
topic_suggestion = GeminiService.suggest_topic(text, "", existing_topics)
if topic_suggestion.get("suggested_new_topic_id") and topic_suggestion.get("suggested_new_topic"):
new_concept_id = topic_suggestion["suggested_new_topic_id"].upper()
new_concept_name = topic_suggestion["suggested_new_topic"]
# Check if concept already exists
existing = db.query(Concept).filter(Concept.concept_id == new_concept_id).first()
if not existing:
# Create new concept
new_concept = Concept(
concept_id=new_concept_id,
description_en=new_concept_name,
description_hi=topic_suggestion.get("suggested_new_topic_hi"),
description_kn=topic_suggestion.get("suggested_new_topic_kn"),
subject="General",
grade="1-12"
)
db.add(new_concept)
# Add synonyms
import uuid
synonyms_to_add = []
# Add English synonyms
for syn in topic_suggestion.get("synonyms_en", [new_concept_name.lower()]):
synonyms_to_add.append(ConceptSynonym(
id=uuid.uuid4(),
concept_id=new_concept_id,
language="en",
term=syn.lower()
))
# Add Hindi synonyms
for syn in topic_suggestion.get("synonyms_hi", []):
synonyms_to_add.append(ConceptSynonym(
id=uuid.uuid4(),
concept_id=new_concept_id,
language="hi",
term=syn
))
# Add Kannada synonyms
for syn in topic_suggestion.get("synonyms_kn", []):
synonyms_to_add.append(ConceptSynonym(
id=uuid.uuid4(),
concept_id=new_concept_id,
language="kn",
term=syn
))
for syn in synonyms_to_add:
db.add(syn)
db.commit()
print(f"[ConceptResolver] ✓ Step 9: CREATED NEW CONCEPT '{new_concept_id}' ({new_concept_name})")
return (new_concept_id, language, normalized)
else:
print(f"[ConceptResolver] ✓ Step 9: Concept '{new_concept_id}' already exists")
return (new_concept_id, language, normalized)
print(f"[ConceptResolver] Step 9: Could not create new concept")
# Fallback: If non-English, try translation approach
if language != "en":
translated_text, _ = GeminiService.translate_text(text, target_language="en")
if translated_text and translated_text != text:
translated_normalized = ConceptResolver.normalize_text(translated_text, "en")
# Try exact match with translated text
synonym = db.query(ConceptSynonym).filter(
func.lower(ConceptSynonym.term) == translated_normalized.lower()
).first()
if synonym:
print(f"[ConceptResolver] ✓ Step 8: GEMINI TRANSLATION EXACT MATCH '{translated_normalized}' -> {synonym.concept_id}")
return (synonym.concept_id, language, normalized)
# Try partial match with translated text
synonym = db.query(ConceptSynonym).filter(
func.lower(ConceptSynonym.term).contains(translated_normalized.lower())
).first()
if synonym:
print(f"[ConceptResolver] ✓ Step 8: GEMINI TRANSLATION PARTIAL MATCH '{translated_normalized}' -> {synonym.concept_id}")
return (synonym.concept_id, language, normalized)
print(f"[ConceptResolver] Step 8: Gemini translation - no match")
except Exception as e:
print(f"[ConceptResolver] Step 8: Gemini AI error: {e}")
# No match found
print(f"[ConceptResolver] ✗ NO MATCH FOUND for '{text}'")
return (None, language, normalized)
@staticmethod
def get_concept_by_id(db: Session, concept_id: str) -> Optional[Concept]:
"""Get a concept by its ID."""
return db.query(Concept).filter(Concept.concept_id == concept_id).first()
@staticmethod
def get_all_concepts(db: Session, language: Optional[str] = None) -> List[Concept]:
"""
Get all concepts, optionally filtered by language.
If language is specified, only returns concepts that have synonyms in that language.
"""
if language:
concept_ids = db.query(ConceptSynonym.concept_id).filter(
ConceptSynonym.language == language
).distinct().all()
concept_ids = [c[0] for c in concept_ids]
return db.query(Concept).filter(Concept.concept_id.in_(concept_ids)).all()
return db.query(Concept).all()
@staticmethod
def get_synonyms_for_concept(db: Session, concept_id: str, language: Optional[str] = None) -> List[ConceptSynonym]:
"""Get all synonyms for a concept, optionally filtered by language."""
query = db.query(ConceptSynonym).filter(ConceptSynonym.concept_id == concept_id)
if language:
query = query.filter(ConceptSynonym.language == language)
return query.all()