# src/rag_manager.py
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
class SociolinguisticRAG:
    """In-memory retriever for sociolinguistic rules.

    Rules are embedded with a local SentenceTransformer model and stored in a
    flat FAISS index. ``self.chunks[i]`` is always the text whose vector sits
    at position ``i`` of ``self.index``; both are replaced together whenever a
    new persona is loaded.
    """

    def __init__(self):
        """Load the embedding model and create an empty FAISS index."""
        print("🧠 Initializing Local Embedding Model (all-MiniLM-L6-v2)...")
        # Downloads tiny ~80MB model to your server once
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.embedding_dim = self.encoder.get_sentence_embedding_dimension()
        # Empty in-memory FAISS index using exact L2 distance (not cosine).
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        self.chunks = []

    def load_persona_rules(self, dialect_name: str, rules_list: list):
        """Vectorize a list of rules and make them the active rule set.

        Any previously loaded rules are discarded so that index positions
        keep matching positions in ``self.chunks``.

        Args:
            dialect_name: Human-readable name, used only in progress messages.
            rules_list: List of rule strings (e.g. ``["Rule 1...", "Rule 2..."]``).
                An empty or ``None`` value leaves the current state untouched.
        """
        if not rules_list:
            return

        print(f"📚 Vectorizing {len(rules_list)} rules for {dialect_name}...")

        # Copy so later mutation of the caller's list cannot desync the index.
        new_chunks = list(rules_list)

        # Encode before touching any state: if encoding fails, the previously
        # loaded rules stay intact. FAISS requires float32 numpy arrays.
        embeddings = np.array(self.encoder.encode(new_chunks)).astype('float32')

        # Drop stale vectors from an earlier load; without this the index
        # would hold old + new vectors while self.chunks holds only the new.
        self.index.reset()
        self.index.add(embeddings)
        self.chunks = new_chunks

        print(f"✅ {dialect_name} successfully vectorized and indexed in RAM.")

    def retrieve_context(self, user_transcription: str, k=3) -> str:
        """Return the ``k`` rules closest to the transcription as a bullet list.

        Args:
            user_transcription: The text to match against the loaded rules.
            k: Maximum number of rules to return. Values larger than the
                number of loaded rules are clamped; values below 1 yield an
                empty string.

        Returns:
            One ``"- rule"`` line per retrieved rule joined by newlines, or a
            fixed notice when no rules are loaded.
        """
        if not self.chunks:
            return "No specific rules loaded for this dialect."

        # Never ask for more neighbours than exist: FAISS pads missing
        # results with the label -1.
        top_k = min(k, len(self.chunks))
        if top_k < 1:
            return ""

        # 1. Convert the user's sentence into a float32 query vector
        query_vector = np.array(
            self.encoder.encode([user_transcription])
        ).astype('float32')

        # 2. Search FAISS for the nearest stored vectors
        _distances, indices = self.index.search(query_vector, top_k)

        # 3. Map ids back to rule text. The lower bound matters: a -1 label
        #    would otherwise silently select the last rule.
        n_chunks = len(self.chunks)
        retrieved_rules = [
            self.chunks[int(i)] for i in indices[0] if 0 <= int(i) < n_chunks
        ]

        # 4. Format them into a neat string for Gemini
        return "\n".join(f"- {rule}" for rule in retrieved_rules)
# ==========================================
# EXAMPLE USAGE / LOCAL TESTING
# ==========================================
if __name__ == "__main__":
    # Demo flow: build the retriever, index a small Nigerian Pidgin rule set,
    # then look up the rules closest to one sample utterance.
    demo_dialect = "Nigerian Pidgin"
    demo_rules = [
        "Lexicon: 'Wahala' means trouble or problem.",
        "Lexicon: 'How far' is a greeting meaning 'How are you' or 'What's going on'.",
        "Pragmatics: Repeating a word (e.g., 'now now') emphasizes extreme urgency.",
        "Syntax: Pluralization is often done by adding 'dem' after a noun.",
    ]

    # Startup phase: construct the retriever and index the rules.
    retriever = SociolinguisticRAG()
    retriever.load_persona_rules(demo_dialect, demo_rules)

    # Query phase: stands in for a transcription of what the user said.
    spoken = "What's going on now?"
    print(f"\n🗣️ User said: {spoken}")

    top_matches = retriever.retrieve_context(spoken, k=2)
    print(f"🎯 Retrieved Context for Gemini:\n{top_matches}")