"""MindMesh RAG pipeline.

Builds (or loads) a FAISS inner-product index over ``RAW_DIR/<domain>/*.txt``
files, retrieves the chunks most similar to a query, and synthesizes an
answer either with a local summarization model or the Groq chat API.
"""

import glob
import json
import os

import faiss
import numpy as np
from groq import Groq
from sentence_transformers import SentenceTransformer
from transformers import pipeline

from src.config import *  # INDEX_DIR, RAW_DIR, INDEX_PATH, META_PATH, model names, limits

# Ensure directories exist before any index read/write.
os.makedirs(INDEX_DIR, exist_ok=True)

# Initialize models once at import time — loading them is expensive.
embedder = SentenceTransformer(EMBEDDING_MODEL)
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL)
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# --- Token Counter ---
# Prefer an exact tiktoken count; fall back to the rough ~4-chars-per-token
# heuristic when tiktoken is unavailable.
try:
    import tiktoken

    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

    def count_tokens(text):
        """Return the exact tiktoken token count of *text*."""
        return len(enc.encode(text))

except Exception:

    def count_tokens(text):
        """Approximate the token count of *text* (~4 characters per token)."""
        return len(text) // 4


def _chunk_text(text, size=800, overlap=120):
    """Split *text* into overlapping character chunks.

    FIX: the original ``while`` loop advanced by ``size - overlap`` with no
    guard, so ``overlap >= size`` looped forever; the step is now clamped to
    at least 1. Whitespace-only chunks are dropped so downstream embedding
    never receives empty strings.
    """
    step = max(size - overlap, 1)
    chunks = []
    for start in range(0, len(text), step):
        piece = text[start:start + size].strip()
        if piece:
            chunks.append(piece)
    return chunks


# --- Build Index ---
def build_index(dim=384):
    """Embed every ``RAW_DIR/<domain>/*.txt`` file into a fresh FAISS index.

    Persists the index to ``INDEX_PATH`` and per-chunk metadata (domain,
    chunk text, source filename) to ``META_PATH``.

    Args:
        dim: embedding dimensionality (default 384, matching the
            all-MiniLM-style sentence-transformers models).

    Returns:
        Tuple ``(index, meta)`` — the in-memory FAISS index and the
        metadata list, parallel to the index's vector order.
    """
    index = faiss.IndexFlatIP(dim)
    meta = []
    # sorted() makes the chunk/vector order deterministic across runs;
    # raw glob order is filesystem-dependent.
    for domain_dir in sorted(glob.glob(f"{RAW_DIR}/*")):
        domain = os.path.basename(domain_dir)
        for path in sorted(glob.glob(f"{domain_dir}/*.txt")):
            with open(path, encoding="utf-8") as f:
                text = f.read()
            chunks = _chunk_text(text)
            if not chunks:
                # FIX: empty/whitespace-only files previously crashed on
                # embedder.encode([]) / index.add of an empty array.
                continue
            vecs = embedder.encode(chunks, normalize_embeddings=True)
            index.add(np.asarray(vecs, dtype="float32"))
            source = os.path.basename(path)
            meta.extend(
                {"domain": domain, "text": ch, "source": source}
                for ch in chunks
            )
            print(f"✅ Indexed {domain}/{source} ({len(chunks)} chunks)")
    faiss.write_index(index, INDEX_PATH)
    # FIX: the original json.dump(..., open(...)) leaked the file handle and
    # used the platform-default encoding; emoji/unicode chunk text needs utf-8.
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False)
    print(f"🎉 Index built: {len(meta)} chunks total.")
    return index, meta


# Load the persisted index if present; otherwise build it from scratch.
if not os.path.exists(INDEX_PATH):
    index, meta = build_index()
else:
    index = faiss.read_index(INDEX_PATH)
    # FIX: close the metadata file (was json.load(open(META_PATH))).
    with open(META_PATH, encoding="utf-8") as f:
        meta = json.load(f)


# --- Retrieval ---
def retrieve_text(query, topk=TOP_K_RESULTS):
    """Return the metadata dicts of the *topk* chunks most similar to *query*."""
    qvec = embedder.encode([query], normalize_embeddings=True).astype("float32")
    _scores, ids = index.search(qvec, topk)
    # FIX: FAISS pads the id array with -1 when the index holds fewer than
    # topk vectors; the original meta[i] then silently returned meta[-1].
    return [meta[i] for i in ids[0] if i >= 0]


# --- Token limiter ---
def trim_to_token_limit(text, max_tokens=MAX_TOKENS):
    """Trim *text* so its estimated token count fits within *max_tokens*.

    Trimming is proportional on characters (tokens scale roughly linearly
    with length), so the result is approximate, not an exact token cut.
    """
    tokens = count_tokens(text)
    if tokens > max_tokens:
        print(f"⚠️ Context too long ({tokens}). Trimming...")
        cutoff_ratio = max_tokens / tokens
        text = text[:int(len(text) * cutoff_ratio)]
    return text


# --- Main Answer Generator ---
def generate_answer(query, mode):
    """Answer *query* from retrieved context and return a markdown report.

    Args:
        query: the user's question.
        mode: "Quick Summary (Offline)" uses the local summarizer; any other
            value calls the Groq chat API (primary model, then fallback).

    Returns:
        Markdown string: the synthesized answer followed by source highlights.
        Groq failures are reported inline rather than raised.
    """
    retrieved = retrieve_text(query)
    combined = " ".join(r["text"] for r in retrieved)
    safe_context = trim_to_token_limit(combined)
    if mode == "Quick Summary (Offline)":
        summary = summarizer(
            safe_context, max_length=180, min_length=60, do_sample=False
        )[0]["summary_text"]
    else:
        prompt = f"""
You are MindMesh, a cross-domain reasoning assistant.
Question: {query}
Context: {safe_context}
Synthesize a precise and insightful answer across disciplines.
"""
        try:
            response = client.chat.completions.create(
                model=PRIMARY_GROQ_MODEL,
                messages=[{"role": "user", "content": prompt}],
            )
            summary = response.choices[0].message.content.strip()
        except Exception:
            # Primary model failed (rate limit, outage, ...) — retry once on
            # the fallback model before surfacing the error to the user.
            try:
                response = client.chat.completions.create(
                    model=FALLBACK_GROQ_MODEL,
                    messages=[{"role": "user", "content": prompt}],
                )
                summary = response.choices[0].message.content.strip()
            except Exception as e2:
                summary = f"⚠️ Groq API error: {str(e2)}"
    md = f"## 🧭 Synthesized Insight\n{summary}\n\n---\n### 🔍 Source Highlights\n"
    for r in retrieved:
        md += f"**{r['domain'].title()} — {r['source']}** \n{r['text'][:300]}...\n\n"
    return md


# --- Rebuild Index with Feedback ---
def rebuild():
    """Generator yielding progress messages while rebuilding the FAISS index."""
    yield "⚙️ Rebuilding FAISS index... please wait ⏳"
    build_index()
    yield "✅ Index rebuilt successfully! (FAISS + metadata updated)"