raviix46 committed on
Commit
5785ed4
·
verified ·
1 Parent(s): 05d3b70

Create logic.py

Browse files
Files changed (1) hide show
  1. src/logic.py +115 -0
src/logic.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, glob, json, faiss, numpy as np
2
+ from sentence_transformers import SentenceTransformer
3
+ from transformers import pipeline
4
+ from groq import Groq
5
+ from src.config import *
6
+
7
# Ensure directories exist
os.makedirs(INDEX_DIR, exist_ok=True)

# Initialize models
# Sentence-transformer used to embed both chunks and queries; vectors are
# normalized at encode time, so inner product == cosine similarity.
embedder = SentenceTransformer(EMBEDDING_MODEL)
# Local HF summarization pipeline backing the offline "Quick Summary" mode.
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL)
# Groq chat client; API key is read from the GROQ_API_KEY environment variable
# (may be None here — the Groq SDK will fail at call time if unset).
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
14
+
15
# --- Token Counter ---
# Use an exact tokenizer when tiktoken is available; otherwise fall back to
# a rough heuristic of ~4 characters per token.
try:
    import tiktoken

    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

    def count_tokens(text):
        """Return the exact token count of *text* under the GPT-3.5 encoding."""
        return len(enc.encode(text))
except Exception:
    def count_tokens(text):
        """Approximate the token count of *text* (about 4 chars per token)."""
        return len(text) // 4
22
+
23
# --- Build Index ---
def build_index():
    """Build a FAISS inner-product index over all .txt files under RAW_DIR.

    Each file is split into overlapping character chunks, embedded with the
    sentence-transformer (normalized embeddings, so inner product equals
    cosine similarity), and added to the index. The index is persisted to
    INDEX_PATH and the row-aligned chunk metadata to META_PATH.

    Returns:
        (index, meta): the FAISS index and a list of dicts with keys
        "domain", "text" and "source", one entry per index row.
    """
    # Derive the vector size from the model instead of hard-coding 384, so a
    # different EMBEDDING_MODEL keeps working.
    dim = embedder.get_sentence_embedding_dimension() or 384
    index = faiss.IndexFlatIP(dim)
    meta = []

    def chunk_text(text, size=800, overlap=120):
        """Split *text* into stripped chunks of *size* chars with *overlap* overlap."""
        chunks = []
        i = 0
        step = size - overlap
        while i < len(text):
            piece = text[i:i + size].strip()
            if piece:  # skip whitespace-only tails so no empty chunk is embedded
                chunks.append(piece)
            i += step
        return chunks

    for domain_dir in glob.glob(f"{RAW_DIR}/*"):
        domain = os.path.basename(domain_dir)
        for path in glob.glob(f"{domain_dir}/*.txt"):
            with open(path, encoding="utf-8") as f:
                text = f.read()
            chunks = chunk_text(text)
            if not chunks:  # empty file: nothing to embed, nothing to record
                continue
            vecs = embedder.encode(chunks, normalize_embeddings=True)
            index.add(np.array(vecs).astype("float32"))
            for ch in chunks:
                meta.append({"domain": domain, "text": ch, "source": os.path.basename(path)})
            print(f"✅ Indexed {domain}/{os.path.basename(path)} ({len(chunks)} chunks)")

    faiss.write_index(index, INDEX_PATH)
    # Write metadata via a context manager instead of leaking the file handle.
    with open(META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f)
    print(f"🎉 Index built: {len(meta)} chunks total.")
    return index, meta
52
+
53
# Load or build index.
# Require BOTH artifacts: an index file without its metadata file would make
# json.load crash (or worse, pair vectors with stale metadata).
if not (os.path.exists(INDEX_PATH) and os.path.exists(META_PATH)):
    index, meta = build_index()
else:
    index = faiss.read_index(INDEX_PATH)
    # Context manager closes the metadata file instead of leaking the handle.
    with open(META_PATH, encoding="utf-8") as f:
        meta = json.load(f)
59
+
60
# --- Retrieval ---
def retrieve_text(query, topk=TOP_K_RESULTS):
    """Return metadata for the *topk* chunks most similar to *query*.

    The query is embedded with the same normalization as the indexed chunks,
    so the inner-product search ranks by cosine similarity.

    Returns:
        List of {"domain", "text", "source"} dicts, best match first.
    """
    qvec = embedder.encode([query], normalize_embeddings=True).astype("float32")
    D, I = index.search(qvec, topk)
    # FAISS pads the result with -1 when the index holds fewer than topk
    # vectors; meta[-1] would silently return the wrong chunk, so drop them.
    return [meta[i] for i in I[0] if i >= 0]
65
+
66
# --- Token limiter ---
def trim_to_token_limit(text, max_tokens=MAX_TOKENS):
    """Trim *text* so its estimated token count fits within *max_tokens*.

    The cut is proportional by characters, so the boundary is approximate
    rather than an exact token edge. Text already under the limit is
    returned unchanged.
    """
    total = count_tokens(text)
    if total <= max_tokens:
        return text
    print(f"⚠️ Context too long ({total}). Trimming...")
    keep = int(len(text) * (max_tokens / total))
    return text[:keep]
74
+
75
# --- Main Answer Generator ---
def generate_answer(query, mode):
    """Answer *query* from retrieved context and return a Markdown report.

    Args:
        query: user question to retrieve context for and answer.
        mode: "Quick Summary (Offline)" runs the local summarizer pipeline;
            any other value queries the primary Groq model, retries once on
            FALLBACK_GROQ_MODEL, and finally embeds the error text in the
            answer if both calls fail (the function itself never raises for
            API errors).

    Returns:
        Markdown string: synthesized answer followed by source excerpts
        (first 300 chars of each retrieved chunk).
    """
    retrieved = retrieve_text(query)
    combined = " ".join([r["text"] for r in retrieved])
    safe_context = trim_to_token_limit(combined)

    if mode == "Quick Summary (Offline)":
        summary = summarizer(safe_context, max_length=180, min_length=60, do_sample=False)[0]["summary_text"]
    else:
        prompt = f"""
You are MindMesh, a cross-domain reasoning assistant.
Question: {query}
Context: {safe_context}
Synthesize a precise and insightful answer across disciplines.
"""
        try:
            response = client.chat.completions.create(
                model=PRIMARY_GROQ_MODEL,
                messages=[{"role": "user", "content": prompt}],
            )
            summary = response.choices[0].message.content.strip()
        except Exception:  # first failure: retry once on the fallback model
            try:
                response = client.chat.completions.create(
                    model=FALLBACK_GROQ_MODEL,
                    messages=[{"role": "user", "content": prompt}],
                )
                summary = response.choices[0].message.content.strip()
            except Exception as e2:
                # Both models failed: surface the error in the answer body.
                summary = f"⚠️ Groq API error: {str(e2)}"

    md = f"## 🧭 Synthesized Insight\n{summary}\n\n---\n### 🔍 Source Highlights\n"
    for r in retrieved:
        md += f"**{r['domain'].title()} — {r['source']}** \n{r['text'][:300]}...\n\n"
    return md
110
+
111
# --- Rebuild Index with Feedback ---
def rebuild():
    """Yield UI status messages around a full FAISS index rebuild.

    Generator: the first message is emitted before the rebuild starts, the
    second after build_index() has written the new index and metadata.
    """
    start_msg = "⚙️ Rebuilding FAISS index... please wait ⏳"
    done_msg = "✅ Index rebuilt successfully! (FAISS + metadata updated)"
    yield start_msg
    build_index()
    yield done_msg