Shriyakupp committed on
Commit
e8f281a
·
verified Β·
1 Parent(s): b0be5b8

Upload 10 files

Browse files
Files changed (10) hide show
  1. .gitignore +4 -0
  2. README.md +9 -10
  3. abc.py +7 -0
  4. api.py +154 -0
  5. auth.json +1 -0
  6. customProvider.js +16 -0
  7. metadata.json +0 -0
  8. pf.yaml +74 -0
  9. requirements.txt +7 -0
  10. semantic_search_local.py +177 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
.env
__pycache__

.vercel
# Never commit captured session cookies / credentials.
auth.json
README.md CHANGED
@@ -1,11 +1,10 @@
1
- ---
2
- title: Iitm Scraper
3
- emoji: πŸŒ–
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
+ To get started, set your OPENAI_API_KEY environment variable, or other required keys for the providers you selected.
 
 
 
 
 
 
 
 
2
 
3
+ Next, edit pf.yaml (the Promptfoo configuration file in this repo).
4
+
5
+ Then run:
6
+ ```
7
+ promptfoo eval
8
+ ```
9
+
10
+ Afterwards, you can view the results by running `promptfoo view`
abc.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
"""Smoke-test client: POST a sample question to the local QA API and print the reply."""
import requests

url = "http://127.0.0.1:8000/api"
data = {"question": "What is AIPIPE?", "api_key": "your-api-key"}

# timeout: without it, requests.post blocks forever if the server is down/hung.
response = requests.post(url, json=data, timeout=30)
print(response.text)
api.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uvicorn
3
+ import requests
4
+ import json
5
+ import numpy as np
6
+ import faiss
7
+ from dotenv import load_dotenv
8
+ from collections import defaultdict
9
+ from fastapi import FastAPI, HTTPException, Request
10
+ from pydantic import BaseModel
11
+ from sentence_transformers import SentenceTransformer
12
+
13
# --- Application & configuration setup ---
app = FastAPI()

# Pull secrets from a local .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv("AIPIPE_API_KEY")

# Fail fast at import time: without the key, every request would fail later anyway.
if not api_key:
    raise RuntimeError("Missing API key in environment variables.")
22
+
23
# --- Load Discourse Data ---
try:
    with open("data/discourse_posts.json", "r", encoding="utf-8") as f:
        posts_data = json.load(f)
except FileNotFoundError as err:
    # Chain the original exception so the traceback keeps the failing path (PEP 3134).
    raise RuntimeError("Could not find 'data/discourse_posts.json'. Ensure the file is in the correct location.") from err

# Group posts by topic_id; each bucket remembers its title when any post carries one.
topics = defaultdict(lambda: {"topic_title": "", "posts": []})
for post in posts_data:
    tid = post["topic_id"]
    topics[tid]["posts"].append(post)
    if "topic_title" in post:
        topics[tid]["topic_title"] = post["topic_title"]

# Sort posts within topics by post_number so each thread reads in posting order.
for topic in topics.values():
    topic["posts"].sort(key=lambda x: x.get("post_number", 0))
41
+
42
# --- Embedding Setup ---
def normalize(v):
    """Scale *v* to unit L2 norm; a zero vector is returned unchanged."""
    magnitude = np.linalg.norm(v)
    if magnitude == 0:
        return v
    return v / magnitude
46
+
47
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embedding_data = []   # per-subthread metadata, parallel to `embeddings`
embeddings = []       # unit-norm vectors, one per root subthread

# Process topics for FAISS: one embedding per root post plus its reply subtree.
for tid, data in topics.items():
    posts = data["posts"]
    title = data["topic_title"]
    reply_map = defaultdict(list)
    by_number = {}

    # Index posts by number and bucket them under their parent post.
    for p in posts:
        pn = p.get("post_number")
        if pn is not None:
            by_number[pn] = p
        parent = p.get("reply_to_post_number")
        reply_map[parent].append(p)

    def extract(pn):
        """Collect post `pn` and all transitive replies in preorder."""
        collected = []
        def dfs(n):
            if n not in by_number:
                return
            p = by_number[n]
            collected.append(p)
            for child in reply_map.get(n, []):
                dfs(child.get("post_number"))
        dfs(pn)
        return collected

    # Roots are posts that are not a reply to anything.
    roots = [p for p in posts if not p.get("reply_to_post_number")]
    for root in roots:
        root_num = root.get("post_number", 1)
        thread = extract(root_num)
        text = f"Topic: {title}\n\n" + "\n\n---\n\n".join(
            p.get("content", "").strip() for p in thread if p.get("content")
        )
        emb = normalize(embedder.encode(text, convert_to_numpy=True))
        embedding_data.append({
            "topic_id": tid,
            "topic_title": title,
            "root_post_number": root_num,
            "post_numbers": [p.get("post_number") for p in thread],
            "combined_text": text
        })
        embeddings.append(emb)

# Guard: `embeddings[0]` below would raise a bare IndexError on an empty corpus
# (the sibling script semantic_search_local.py already guards this case).
if not embeddings:
    raise RuntimeError("No subthreads were embedded; check data/discourse_posts.json.")

# Create FAISS index; inner product equals cosine similarity on unit-norm vectors.
index = faiss.IndexFlatIP(len(embeddings[0]))
index.add(np.vstack(embeddings).astype("float32"))
97
+
98
# --- API Input Model ---
from typing import Optional

class QuestionInput(BaseModel):
    """Request body for POST /api/."""
    # The user's question; emptiness is rejected in the endpoint.
    question: str
    # Optional image payload; accepted for API compatibility but unused here.
    # Fix: `image: str = None` was an invalid annotation (None is not a str).
    image: Optional[str] = None
102
+
103
+
104
+
105
# --- AIPIPE API Configuration ---
AIPIPE_URL = "https://your-aipipe-endpoint.com/chat/completions"
AIPIPE_KEY = api_key

def query_aipipe(prompt):
    """POST *prompt* to the AIPIPE chat-completions endpoint.

    Returns the parsed JSON response on HTTP 200; raises HTTPException(500)
    carrying the upstream body text on any other status.
    """
    headers = {"Authorization": f"Bearer {AIPIPE_KEY}", "Content-Type": "application/json"}
    data = {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": prompt}], "temperature": 0.7}

    # timeout: a stalled upstream must not hang the whole request handler forever.
    response = requests.post(AIPIPE_URL, json=data, headers=headers, timeout=60)
    if response.status_code == 200:
        return response.json()
    raise HTTPException(status_code=500, detail=f"AIPIPE API error: {response.text}")
118
+
119
# --- API Endpoint for Answer Generation ---
@app.post("/api/")
async def answer_question(payload: QuestionInput):
    """Answer a question via FAISS retrieval + the AIPIPE LLM.

    Fix: the top-3 retrieved subthreads were previously fetched but never
    passed to the model (query_aipipe received the bare question, defeating
    the retrieval step). The context is now embedded in the prompt.

    Returns {"answer": str, "links": [{"url", "text"}]}.
    Raises HTTPException 400 on an empty question, 500 on upstream failure.
    """
    q = payload.question

    # Ensure query is valid
    if not q:
        raise HTTPException(status_code=400, detail="Question field cannot be empty.")

    # Search FAISS Index for the 3 nearest subthreads.
    q_emb = normalize(embedder.encode(q, convert_to_numpy=True)).astype("float32")
    D, I = index.search(np.array([q_emb]), 3)

    top_results = []
    for score, idx in zip(D[0], I[0]):
        data = embedding_data[idx]
        top_results.append({
            "score": float(score),
            "text": data["combined_text"],
            "topic_id": data["topic_id"],
            "url": f"https://discourse.onlinedegree.iitm.ac.in/t/{data['topic_id']}"
        })

    # Ground the LLM in the retrieved threads instead of sending the bare question.
    context = "\n\n===\n\n".join(r["text"] for r in top_results)
    prompt = (
        "Answer the question using the following forum discussions.\n\n"
        f"{context}\n\nQuestion: {q}\nAnswer:"
    )

    try:
        answer_response = query_aipipe(prompt)
        answer = answer_response.get("choices", [{}])[0].get("message", {}).get("content", "No response.")
    except HTTPException:
        # Preserve the status/detail query_aipipe already attached instead of rewrapping.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching response from AIPIPE: {str(e)}")

    links = [{"url": r["url"], "text": r["text"][:120]} for r in top_results]
    return {"answer": answer, "links": links}
151
+
152
# --- Run the Server ---
if __name__ == "__main__":
    # reload=True is a development convenience; uvicorn's documented defaults
    # serve on 127.0.0.1:8000 (matching the URL abc.py targets).
    uvicorn.run("api:app", reload=True)
auth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cookies": [{"name": "OTZ", "value": "8118346_34_34__34_", "domain": "accounts.google.com", "path": "/", "expires": 1751996758, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "NID", "value": "524=pGzN3AM4nktYQ9VMLlUJRVGl_-N9oMB6VQ3y5ZzHlJMUjidFhWpx-wj6mD-1yQgvKheuVi9mm7qessL2ykyLjtbKpfTq2WCgynkS95EkkkqXMq7U75UrCWoPbvTRd6veMOI6C6pElgLmxblvDr-LVidfbiS9qSqChtwuOiYKpHWijMwoWaKiTcbanPCEmvqkNeV-rZtfov0MNPt9PNOo7EQZNz9SzosAi1lykwflQWAbaSe9d-W4R95Sbv0kDcbO-_zQ5Y8TFdvc9yH9gpQhuW2X38R8TBVvkyUMDbNhJyfYE0ojJK7lNUx251m2skHFgYuQFEPY1VxD4JbGuQD_oM3V0N9SOW-omyMI3JTL6nhTIXtiAvOjs7y9ya0O3NfcDbttscYZORgMjI-0rvkqsUiN1XxRTWOuhCC_ZB4H5O44LwdGdyr3MKsWTSMC14osSSLzKFQkNeYUOF4chQhDMDNdZnimF3CXbhFKHs4cj3O8SPfUDNThNJzbxcmd1MnhbKkSt0wrVaU9bo5Xu_S2bdKaGtOZnzv3QOgv0C6S1l2-1PZVkvIzzptNITT0ivV3wenZJjfHwNPdlmb4ICMV8UyexG-LtcDTOYReU8z1Lv6v-isvfYBASqLo_g", "domain": ".google.com", "path": "/", "expires": 1765215958.960009, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "SID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4gJFIH3mFLpSqgSCdtlzmkAACgYKAfMSARISFQHGX2MicfviDNpac-kl2qNmcIh3JhoVAUF8yKrvZgwyDZB41fASWRBTKWv30076", "domain": ".google.com", "path": "/", "expires": 1783964850.960194, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4SMPilUhML70XfSXFgcXh1gACgYKAc0SARISFQHGX2Mia96MxI3v2UvdIesQ37uf-BoVAUF8yKrUcNT98CQMUjngFXRwODfS0076", "domain": ".google.com", "path": "/", "expires": 1783964850.96023, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4JC7T22vhP_XXy2jXs53PRQACgYKAfcSARISFQHGX2MiUHTSX37QUhtQc4NSoQ1CABoVAUF8yKoFTOV_FouQBFWA5RF1_EnW0076", "domain": ".google.com", "path": "/", "expires": 1783964850.960261, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "HSID", "value": "AgSi0p7ogPeWgCNUQ", "domain": ".google.com", "path": "/", 
"expires": 1783964850.960393, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "SSID", "value": "AiCZVzx9BdRDHVheM", "domain": ".google.com", "path": "/", "expires": 1783964850.960421, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "APISID", "value": "R0cKHrXJralSylP4/Aq8bgiMom9f5Sr5f1", "domain": ".google.com", "path": "/", "expires": 1783964850.960458, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "SAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.com", "path": "/", "expires": 1783964850.960483, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-1PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.com", "path": "/", "expires": 1783964850.960525, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.com", "path": "/", "expires": 1783964850.960551, "httpOnly": false, "secure": true, "sameSite": "None"}, {"name": "ACCOUNT_CHOOSER", "value": "AFx_qI7GYU-n0DopkJTbsr_pk9zQ5pY0thVpNAu6s_a2pDOfodRPrPB1DaoafRhjGyD63vbwDRLJnz2ELcUE1SLgCDDXMAL12rdeRMGWP_t_yViqq6u_CSWnLrNKrMD3MDJIgCKNcDOn", "domain": "accounts.google.com", "path": "/", "expires": 1783964850.960575, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Host-GAPS", "value": "1:6NhIn6plsmu1S2-HryhYFnQ_UEOEqr-P85AR0m7sPJLPI1hm7eKy9JdSzqkk55fEVEcYKKanEYe8AP0AO4ovOSK--_OaEQ:S_Y5W5vYNRLpy9BM", "domain": "accounts.google.com", "path": "/", "expires": 1783964850.960603, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "LSID", "value": "s.IN|s.youtube:g.a000xwjUlyTdYTHzF80gbNT7x_ibthdc2qjxxz_h2QwjYFf7S5T8HvrhsOwZ68zOO8PJMD450QACgYKAbUSARISFQHGX2Mit6FsHavPFqVeUb-IjtvZtRoVAUF8yKrKO2C4Yj00E65ati57Kj360076", "domain": "accounts.google.com", "path": "/", "expires": 1783964851.100308, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Host-1PLSID", "value": 
"s.IN|s.youtube:g.a000xwjUlyTdYTHzF80gbNT7x_ibthdc2qjxxz_h2QwjYFf7S5T8TV6aAzkWzk8QWXGmDWGxugACgYKAQkSARISFQHGX2MiNB9FZAeOCpB0aeacnGyJYxoVAUF8yKqbz93N6uv5thWilVvXWR4q0076", "domain": "accounts.google.com", "path": "/", "expires": 1783964851.100508, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Host-3PLSID", "value": "s.IN|s.youtube:g.a000xwjUlyTdYTHzF80gbNT7x_ibthdc2qjxxz_h2QwjYFf7S5T8JJw8WRM2HuDa-udrKltuVgACgYKAaISARISFQHGX2Mi0Js4l2JXX2ipUuQ_mdnPchoVAUF8yKqNDyQG3flLgDqxLIyIi74F0076", "domain": "accounts.google.com", "path": "/", "expires": 1783964851.10057, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "__Secure-1PSIDTS", "value": "sidts-CjEB5H03P6P3jGo43x7iUx0MEGc4ah0vRt5yvc9XT00Wr_XUEb43WeZg2WvE5FCf1OqNEAA", "domain": ".youtube.com", "path": "/", "expires": 1780940851.445885, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSIDTS", "value": "sidts-CjEB5H03P6P3jGo43x7iUx0MEGc4ah0vRt5yvc9XT00Wr_XUEb43WeZg2WvE5FCf1OqNEAA", "domain": ".youtube.com", "path": "/", "expires": 1780940851.446026, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "HSID", "value": "AZaNjsQu3eYQM6N3L", "domain": ".youtube.com", "path": "/", "expires": 1783964851.44606, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "SSID", "value": "ADenAUJPbV1Tx51cQ", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446163, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "APISID", "value": "R0cKHrXJralSylP4/Aq8bgiMom9f5Sr5f1", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446187, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "SAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446213, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-1PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446235, 
"httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446259, "httpOnly": false, "secure": true, "sameSite": "None"}, {"name": "SID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4gJFIH3mFLpSqgSCdtlzmkAACgYKAfMSARISFQHGX2MicfviDNpac-kl2qNmcIh3JhoVAUF8yKrvZgwyDZB41fASWRBTKWv30076", "domain": ".youtube.com", "path": "/", "expires": 1783964851.44628, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4SMPilUhML70XfSXFgcXh1gACgYKAc0SARISFQHGX2Mia96MxI3v2UvdIesQ37uf-BoVAUF8yKrUcNT98CQMUjngFXRwODfS0076", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446306, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4JC7T22vhP_XXy2jXs53PRQACgYKAfcSARISFQHGX2MiUHTSX37QUhtQc4NSoQ1CABoVAUF8yKoFTOV_FouQBFWA5RF1_EnW0076", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446333, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "HSID", "value": "AZaNjsQu3eYQM6N3L", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762236, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "SSID", "value": "ADenAUJPbV1Tx51cQ", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762376, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "APISID", "value": "R0cKHrXJralSylP4/Aq8bgiMom9f5Sr5f1", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762444, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "SAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762477, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-1PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", 
"domain": ".google.co.in", "path": "/", "expires": 1783964851.762502, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762534, "httpOnly": false, "secure": true, "sameSite": "None"}, {"name": "NID", "value": "524=Xm8tnyfiRZdPI-gz8yA_hamoGHU9acfj34QWWhuxcm_rbHeiOL1i6KjHyXk4Adun5DIsKu8N8f37OYW2XYvO6wG6Jyj2AtsNpp78vFllJoC1HHVOVRBIkXG0V21cgTjAZyl2Qcedfrwi7q1X7wVUfhIDUBD4CHF1PTm4YwjW4XAxIRWKwvxSuRClcI8DJTGl5SoIUEj0GGyx", "domain": ".google.co.in", "path": "/", "expires": 1765216051.762559, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "SID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4gJFIH3mFLpSqgSCdtlzmkAACgYKAfMSARISFQHGX2MicfviDNpac-kl2qNmcIh3JhoVAUF8yKrvZgwyDZB41fASWRBTKWv30076", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762587, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4SMPilUhML70XfSXFgcXh1gACgYKAc0SARISFQHGX2Mia96MxI3v2UvdIesQ37uf-BoVAUF8yKrUcNT98CQMUjngFXRwODfS0076", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762623, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4JC7T22vhP_XXy2jXs53PRQACgYKAfcSARISFQHGX2MiUHTSX37QUhtQc4NSoQ1CABoVAUF8yKoFTOV_FouQBFWA5RF1_EnW0076", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762652, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "SIDCC", "value": "AKEyXzUFCfCv1cpXJ_r1W4aMYnWqNNHWO64uf_F90cBkOzDRVQRUxPwaplWxjg_RBU_Wvx3KXw", "domain": ".google.com", "path": "/", "expires": 1780940852.107024, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSIDCC", "value": "AKEyXzWjEV-GRcLQ_FrvB4nZJ8eDSjl6fOAeVx7AOmdzpQuSX8PJGZ3z5_IR7zhR9_TUmBc-hg", "domain": 
".google.com", "path": "/", "expires": 1780940852.107167, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSIDCC", "value": "AKEyXzUVBh_9F9t2EXuTR5EhQe8lgmz7BY3W1_q6sY3MHvgOomOpkJpjXpSg1bJSJVc_zW7BTA", "domain": ".google.com", "path": "/", "expires": 1780940852.107219, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "_t", "value": "mYXw3oiXxu4xNIUtU9E2cXmEMB08yK9D461vkL8ijLzEC41lU9SgTUkecDDbqpJwoXPZHivj5gh7dPw5Ye2Z%2FSAk2Rtt8QkXuUKUaDRwWCc4DG9j2Gp%2BHBqjFpB%2F3AyW8BcBlkzE5%2BT8E5Arot2Nu023fhbXkVzHO8GHgLctfRL0VzN2Tgh%2Fn%2F%2FYpFVrTNkufiVSP1krNgf%2BaveFark0yuTgdeF14YCVFArh24%2FAhaX7HnS74ihXzi1CejwMknjrvG3gcWCg%2F2MkeLSBXhEOoOCYJswlc7QOCN9dJjcsWFAJvLu1KJU6f%2FNXIq7vB8%2Ba--xJIIe9ITH%2F7QG26Q--8zIvN%2FMJeDdtZmPKnd8lfg%3D%3D", "domain": "discourse.onlinedegree.iitm.ac.in", "path": "/", "expires": 1754588854.026139, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "_bypass_cache", "value": "true", "domain": "discourse.onlinedegree.iitm.ac.in", "path": "/", "expires": -1, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "_forum_session", "value": "0wLfPer6SiQZetqZXqSbPeQkcqDMKZPZ%2BaXw19BWoHwGPMzSp0YbA%2BOGKr7QrGC7%2Fct8%2BacG4jy7BdBV0zgSiEBxHAKw8kbSmHRaYHfKWN1UyktwqWL7U%2BNhX3aBlzYWBRh2Ym5Pc7DVKllXYHmvx0w6lSXQdot490FdWo1uwLW79sVQYUIc4OUXpGal85Dc9V3kk8vUD38c69qOOd4%2BHt%2F7ABghWKtgKYmfD0Do1zTno%2FZawFW07jIuOpVIeFs15H7%2B2CzpW57m8o2%2B4FvRfxWr8A3iYjqnVKXIOVrXB6EtvL2aduVzSaJSuUTpIggxkX0UH22ymyDg6lGOX08Z%2Fof84GouSoZZNvDc8CXqeVoBH%2BLrh0cNpKD%2FCFpQEQ%3D%3D--QlOaYw03YSQJlGs4--cEB8HOMUUqgtVsgTmKSQGg%3D%3D", "domain": "discourse.onlinedegree.iitm.ac.in", "path": "/", "expires": -1, "httpOnly": true, "secure": true, "sameSite": "Lax"}], "origins": [{"origin": "https://discourse.onlinedegree.iitm.ac.in", "localStorage": [{"name": "__mbLastAjax", "value": "1749404907183"}, {"name": "discourse_push_notifications_subscribed-21879", "value": ""}, {"name": "discourse_sidebar-section-tags-collapsed", "value": 
"false"}, {"name": "discourse_desktop_notifications_focus-tracker", "value": "788e1b8dfe2b4052b12424ba130b7217"}, {"name": "discourse_sidebar-section-community-collapsed", "value": "false"}, {"name": "discourse_sidebar-section-categories-collapsed", "value": "false"}, {"name": "safeLocalStorage", "value": "true"}]}]}
customProvider.js ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module.exports = class CustomAPIProvider {
2
+ id() {
3
+ return "custom-api"; // Ensure this method correctly returns the provider ID.
4
+ }
5
+
6
+ async callApi(prompt) {
7
+ const response = await fetch("http://127.0.0.1:8000/ask", {
8
+ method: "POST",
9
+ headers: { "Content-Type": "application/json" },
10
+ body: JSON.stringify({ question: prompt }),
11
+ });
12
+
13
+ const data = await response.json();
14
+ return { output: data.answer };
15
+ }
16
+ };
metadata.json ADDED
The diff for this file is too large to render. See raw diff
 
pf.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Promptfoo configuration: evaluate the local FastAPI QA service.
providers:
  - id: http://127.0.0.1:8000/api
    label: Local FastAPI
    config:
      url: http://127.0.0.1:8000/api
      method: POST
      headers:
        Content-Type: application/json
      # JSON request template; {{prompt}} and {{AIPIPE_API_KEY}} are filled in
      # by promptfoo. (Fixed: a stray blank line inside the object.)
      body: |
        {
          "question": "{{prompt}}",
          "api_key": "{{AIPIPE_API_KEY}}"
        }

prompts:
  - "{{prompt}}"

# Each test sends one question and asserts a substring of the answer.
tests:
  - name: Model usage confusion
    vars:
      prompt: >
        The question asks to use gpt-3.5-turbo-0125 model, but the AI proxy provided by Anand sir only supports gpt-4o-mini. Should we just use gpt-4o-mini or OpenAI API for gpt-3.5 turbo?
    assert:
      - type: contains
        value: gpt-4o-mini

  - name: SRS + Bonus display
    vars:
      prompt: >
        If a student scores 10/10 on GA4 as well as a bonus, how would it appear on the dashboard?
    assert:
      - type: contains
        value: bonus mark

  - name: Docker recommendation
    vars:
      prompt: >
        I know Docker but have not used Podman before. Should I use Docker for this course?
    assert:
      - type: contains
        value: Docker CE

  - name: TDS Sep 2025 Exam Date
    vars:
      prompt: >
        When is the TDS Sep 2025 end-term exam?
    assert:
      - type: contains
        value: date

  - name: OpenAI API key validation
    vars:
      prompt: >
        I have my OpenAI API key saved in the .env file. How can I ensure it's loaded correctly?
    assert:
      - type: contains
        value: os.getenv("OPENAI_API_KEY")

  - name: FastAPI server issue
    vars:
      prompt: >
        My FastAPI server is running, but hitting 127.0.0.1:8000 returns "Not Found." What's wrong?
    assert:
      - type: contains
        value: No route defined for "/"

  - name: Promptfoo response validation
    vars:
      prompt: >
        How can I validate my Promptfoo configuration file?
    assert:
      - type: contains
        value: promptfoo validate pf.yaml
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# UI / misc
streamlit
tqdm
# API service (api.py)
fastapi
uvicorn
requests
python-dotenv
numpy
# Embedding + retrieval
sentence-transformers
transformers
torch
faiss-cpu
semantic_search_local.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # semantic_search_pipeline.py
2
+
3
+ import json
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from collections import defaultdict
7
+ from sentence_transformers import SentenceTransformer
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+ import faiss
10
+
11
# --- Utility functions ---
def clean_text(text):
    """Collapse all runs of whitespace in *text* to single spaces; '' for falsy input."""
    if not text:
        return ""
    # str.split() with no argument already ignores leading/trailing whitespace.
    return " ".join(text.split())
14
+
15
def normalize(v):
    """Return *v* scaled to unit L2 norm; a zero vector passes through unchanged."""
    length = np.linalg.norm(v)
    return v if length == 0 else v / length
18
+
19
# --- Load posts ---
with open("data/discourse_posts.json", "r", encoding="utf-8") as f:
    posts_data = json.load(f)

print(f"βœ… Loaded {len(posts_data)} posts")

# Backfill a sequential 'post_number' (per topic, in file order) for posts
# that lack one, so later thread reconstruction never sees a missing key.
grouped = defaultdict(list)
for post in posts_data:
    grouped[post["topic_id"]].append(post)

for topic_posts in grouped.values():
    for position, post in enumerate(topic_posts, start=1):
        post.setdefault("post_number", position)

# --- Group by topic_id ---
topics = defaultdict(lambda: {"topic_title": "", "posts": []})
for post in posts_data:
    tid = post["topic_id"]
    topics[tid]["posts"].append(post)
    if "topic_title" in post:
        topics[tid]["topic_title"] = post["topic_title"]

# Posts inside each topic read in posting order.
for topic in topics.values():
    topic["posts"].sort(key=lambda x: x.get("post_number", 0))

print(f"βœ… Grouped into {len(topics)} topics")

# --- Embedding Model ---
model_name = "all-MiniLM-L6-v2"  # Or "GritLM/GritLM-8x7B"
embedder = SentenceTransformer(model_name)
50
+
51
# --- Build reply tree ---
def build_reply_map(posts):
    """Index *posts* for thread reconstruction.

    Returns (reply_map, posts_by_number): reply_map maps a parent
    post_number (None for roots) to its direct replies in input order;
    posts_by_number maps post_number -> post. Posts without a
    'post_number' are skipped entirely.
    """
    posts_by_number = {}
    reply_map = defaultdict(list)
    for entry in posts:
        number = entry.get("post_number")
        if number is None:
            continue
        posts_by_number[number] = entry
        reply_map[entry.get("reply_to_post_number")].append(entry)
    return reply_map, posts_by_number
63
+
64
def extract_subthread(root_num, reply_map, posts_by_number):
    """Collect the post at *root_num* and all transitive replies, in preorder.

    Unknown post numbers are silently skipped; an unknown root yields [].
    """
    thread = []
    pending = [root_num]
    while pending:
        number = pending.pop()
        if number not in posts_by_number:
            continue
        thread.append(posts_by_number[number])
        # Push children reversed so the left-most reply is visited first,
        # matching the recursive DFS order.
        children = reply_map.get(number, [])
        pending.extend(child["post_number"] for child in reversed(children))
    return thread
75
+
76
# --- Embed subthreads ---
embedding_data = []   # per-subthread metadata, parallel to `embeddings`
embeddings = []       # unit-norm vectors, one per root subthread

print("πŸ”„ Building subthread embeddings...")

for tid, data in tqdm(topics.items()):
    posts = data["posts"]
    title = data["topic_title"]
    reply_map, by_number = build_reply_map(posts)

    # Roots are posts that are not a reply to anything.
    root_posts = [p for p in posts if not p.get("reply_to_post_number")]

    if not root_posts:
        print(f"⚠️ No root posts found for topic ID {tid}. Skipping.")
        continue

    for root in root_posts:
        if "post_number" not in root:
            print(f"⚠️ Skipping root post due to missing 'post_number': {root}")
            continue
        root_num = root["post_number"]

        # One embedding per root subthread: topic title + all post bodies.
        subthread = extract_subthread(root_num, reply_map, by_number)
        combined = f"Topic: {title}\n\n" + "\n\n---\n\n".join(
            clean_text(p["content"]) for p in subthread if "content" in p
        )

        emb = embedder.encode(combined, convert_to_numpy=True)
        emb = normalize(emb)

        embedding_data.append({
            "topic_id": tid,
            "topic_title": title,
            "root_post_number": root_num,
            "post_numbers": [p["post_number"] for p in subthread if "post_number" in p],
            "combined_text": combined
        })
        embeddings.append(emb)

if not embeddings:
    print("❌ No embeddings were generated. Exiting.")
    # Fix: exit() is the interactive-session helper added by site.py and is not
    # guaranteed in scripts; SystemExit is the portable way to stop (same exit code).
    raise SystemExit

embeddings = np.vstack(embeddings).astype("float32")

# --- Build FAISS index ---
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product == cosine on unit-norm vectors
index.add(embeddings)

print(f"βœ… Indexed {len(embedding_data)} subthreads")
128
+
129
# --- Semantic retrieval ---
def retrieve(query, top_k=5):
    """Embed *query* and return the *top_k* most similar subthreads.

    Each result dict carries the similarity score plus the stored
    subthread metadata and combined text.
    """
    query_vec = embedder.encode(query, convert_to_numpy=True)
    query_vec = normalize(query_vec).astype("float32")
    scores, positions = index.search(np.array([query_vec]), top_k)

    hits = []
    for score, pos in zip(scores[0], positions[0]):
        entry = embedding_data[pos]
        hits.append({
            "score": float(score),
            "topic_id": entry["topic_id"],
            "topic_title": entry["topic_title"],
            "root_post_number": entry["root_post_number"],
            "post_numbers": entry["post_numbers"],
            "combined_text": entry["combined_text"],
        })
    return hits
147
+
148
# --- QA generation using T5 ---
gen_model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

def generate_answer(query, contexts, max_len=256):
    """Generate an answer to *query* grounded in *contexts* using FLAN-T5.

    contexts: list of retrieved thread texts, joined into a single prompt.
    max_len: cap on the generated sequence length.
    """
    joined = "\n\n".join(contexts)
    prompt = f"Answer the question based on the following forum discussion:\n\n{joined}\n\nQuestion: {query}\nAnswer:"
    encoded = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True)
    generated = qa_model.generate(**encoded, max_length=max_len, num_beams=5, early_stopping=True)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
160
+
161
# --- Run Example ---
if __name__ == "__main__":
    query = "If a student scores 10/10 on GA4 as well as a bonus, how would it appear on the dashboard?"

    results = retrieve(query, top_k=3)

    # Show what was retrieved before generating, for manual inspection.
    print("\nπŸ” Top Retrieved Threads:")
    for rank, hit in enumerate(results, 1):
        print(f"\n[{rank}] Score: {hit['score']:.4f}")
        print(f"Topic Title: {hit['topic_title']}")
        print(f"Root Post #: {hit['root_post_number']} | Post IDs: {hit['post_numbers']}")
        print(f"Snippet:\n{hit['combined_text'][:300]}...\n")

    contexts = [hit["combined_text"] for hit in results]
    answer = generate_answer(query, contexts)

    print("\nπŸ’‘ Generated Answer:\n", answer)