|
|
import os, glob, json, faiss, numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from transformers import pipeline |
|
|
from groq import Groq |
|
|
from src.config import * |
|
|
|
|
|
|
|
|
# Ensure the directory that will hold the FAISS index + metadata exists.
os.makedirs(INDEX_DIR, exist_ok=True)


# Module-level singletons, constructed once at import time:
# embedder: sentence-transformers model used for both indexing and queries.
embedder = SentenceTransformer(EMBEDDING_MODEL)

# summarizer: local HF pipeline backing the offline "Quick Summary" mode.
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL)

# client: Groq chat client for the online answer mode.
# NOTE(review): os.getenv returns None when GROQ_API_KEY is unset —
# presumably the Groq SDK then fails at call time, not here; verify.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
|
|
|
|
|
|
|
|
# Token counting: exact counts via tiktoken when it is installed,
# otherwise fall back to a rough 4-characters-per-token heuristic.
try:
    import tiktoken

    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

    def count_tokens(text):
        """Return the exact token count of *text* under the GPT-3.5 tokenizer."""
        return len(enc.encode(text))

except Exception:

    def count_tokens(text):
        """Return an approximate token count (~4 characters per token)."""
        return len(text) // 4
|
|
|
|
|
|
|
|
def build_index():
    """Build a FAISS inner-product index over every .txt file under RAW_DIR.

    Each document is split into overlapping character chunks, embedded with
    the module-level ``embedder`` (normalized, so inner product behaves as
    cosine similarity), and the index plus per-chunk metadata are persisted
    to INDEX_PATH / META_PATH.

    Returns:
        (index, meta): the FAISS index and a list of dicts with keys
        "domain", "text", "source" — one entry per indexed chunk, in the
        same order as the vectors in the index.
    """
    # Ask the embedder for its output dimensionality instead of hard-coding
    # 384; fall back to 384 (the original constant) if it reports None.
    dim = embedder.get_sentence_embedding_dimension() or 384
    index = faiss.IndexFlatIP(dim)
    meta = []

    def chunk_text(text, size=800, overlap=120):
        """Split *text* into overlapping windows, dropping empty chunks."""
        # Guard the step: overlap >= size would make the original loop
        # advance by <= 0 and never terminate.
        step = max(1, size - overlap)
        chunks = []
        i = 0
        while i < len(text):
            piece = text[i:i + size].strip()
            if piece:  # skip whitespace-only windows (e.g. trailing padding)
                chunks.append(piece)
            i += step
        return chunks

    for domain_dir in glob.glob(f"{RAW_DIR}/*"):
        domain = os.path.basename(domain_dir)
        for path in glob.glob(f"{domain_dir}/*.txt"):
            with open(path, encoding="utf-8") as f:
                text = f.read()
            chunks = chunk_text(text)
            if not chunks:
                # Empty / whitespace-only file: nothing to embed, and
                # encode([]) -> index.add would fail on an empty batch.
                continue
            vecs = embedder.encode(chunks, normalize_embeddings=True)
            index.add(np.asarray(vecs, dtype="float32"))
            for ch in chunks:
                meta.append({"domain": domain, "text": ch, "source": os.path.basename(path)})
            print(f"✅ Indexed {domain}/{os.path.basename(path)} ({len(chunks)} chunks)")

    faiss.write_index(index, INDEX_PATH)
    # Use a context manager: the original json.dump(meta, open(...)) leaked
    # the file handle and relied on GC to flush the write.
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False)
    print(f"📦 Index built: {len(meta)} chunks total.")
    return index, meta
|
|
|
|
|
|
|
|
# Load the persisted index at import time, or build it on first run.
# Rebuild when EITHER artifact is missing: the original only checked
# INDEX_PATH and crashed with FileNotFoundError if META_PATH alone had
# been deleted; it also leaked the metadata file handle via json.load(open(...)).
if os.path.exists(INDEX_PATH) and os.path.exists(META_PATH):
    index = faiss.read_index(INDEX_PATH)
    with open(META_PATH, encoding="utf-8") as f:
        meta = json.load(f)
else:
    index, meta = build_index()
|
|
|
|
|
|
|
|
def retrieve_text(query, topk=TOP_K_RESULTS):
    """Return metadata dicts for the chunks most similar to *query*.

    Args:
        query: natural-language search string.
        topk: maximum number of chunks to return.

    Returns:
        Up to *topk* metadata dicts ({"domain", "text", "source"}),
        ranked by inner-product similarity.
    """
    qvec = embedder.encode([query], normalize_embeddings=True).astype("float32")
    _, ids = index.search(qvec, topk)
    # FAISS pads results with -1 when the index holds fewer than topk
    # vectors; the original let those through, and meta[-1] silently
    # returned the LAST chunk instead of no chunk. Filter them out.
    return [meta[i] for i in ids[0] if i >= 0]
|
|
|
|
|
|
|
|
def trim_to_token_limit(text, max_tokens=MAX_TOKENS):
    """Trim *text* so its estimated token count fits within *max_tokens*.

    Uses a proportional character cutoff, which may land slightly over or
    under the exact token limit; acceptable as long as MAX_TOKENS carries
    some safety margin relative to the model's true context window.
    """
    tokens = count_tokens(text)
    if tokens > max_tokens:
        # Repaired the mojibake warning glyph from the original source.
        print(f"⚠️ Context too long ({tokens}). Trimming...")
        cutoff_ratio = max_tokens / tokens
        text = text[:int(len(text) * cutoff_ratio)]
    return text
|
|
|
|
|
|
|
|
def generate_answer(query, mode):
    """Answer *query* from retrieved context, offline or via the Groq API.

    Args:
        query: the user's question.
        mode: "Quick Summary (Offline)" selects the local summarization
            pipeline; any other value goes through the Groq chat API with
            a fallback model on failure.

    Returns:
        Markdown string containing the synthesized answer followed by a
        "Source Highlights" section listing the retrieved chunks.
    """
    retrieved = retrieve_text(query)
    combined = " ".join(r["text"] for r in retrieved)
    safe_context = trim_to_token_limit(combined)

    if mode == "Quick Summary (Offline)":
        summary = summarizer(safe_context, max_length=180, min_length=60, do_sample=False)[0]["summary_text"]
    else:
        prompt = f"""
You are MindMesh, a cross-domain reasoning assistant.
Question: {query}
Context: {safe_context}
Synthesize a precise and insightful answer across disciplines.
"""
        try:
            response = client.chat.completions.create(
                model=PRIMARY_GROQ_MODEL,
                messages=[{"role": "user", "content": prompt}],
            )
            summary = response.choices[0].message.content.strip()
        except Exception as primary_err:
            # Retry on the fallback model. The original discarded the
            # primary error entirely and only ever reported the fallback
            # one; report both so failures are diagnosable.
            try:
                response = client.chat.completions.create(
                    model=FALLBACK_GROQ_MODEL,
                    messages=[{"role": "user", "content": prompt}],
                )
                summary = response.choices[0].message.content.strip()
            except Exception as fallback_err:
                summary = (
                    f"⚠️ Groq API error: primary={primary_err}; "
                    f"fallback={fallback_err}"
                )

    md = f"## 🧠 Synthesized Insight\n{summary}\n\n---\n### 📚 Source Highlights\n"
    for r in retrieved:
        md += f"**{r['domain'].title()} — {r['source']}** \n{r['text'][:300]}...\n\n"
    return md
|
|
|
|
|
|
|
|
def rebuild():
    """Generator used by the UI: yields progress messages while rebuilding.

    Yields a "working" message, runs the (blocking) index build, then
    yields a completion message. The original's final yield string was
    split across two physical lines by a garbled emoji — a syntax error
    as extracted; reconstructed here as a single literal.
    """
    yield "⚙️ Rebuilding FAISS index... please wait ⏳"
    build_index()
    yield "✅ Index rebuilt successfully! (FAISS + metadata updated)"