Shriyakupp committed on
Commit
e8f281a
·
verified Β·
1 Parent(s): b0be5b8

Upload 10 files

Browse files
Files changed (10) hide show
  1. .gitignore +4 -0
  2. README.md +9 -10
  3. abc.py +7 -0
  4. api.py +154 -0
  5. auth.json +1 -0
  6. customProvider.js +16 -0
  7. metadata.json +0 -0
  8. pf.yaml +74 -0
  9. requirements.txt +7 -0
  10. semantic_search_local.py +177 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
.env
__pycache__

.vercel
# Never commit captured session cookies / credentials.
auth.json
README.md CHANGED
@@ -1,11 +1,10 @@
1
- ---
2
- title: Iitm Scraper
3
- emoji: πŸŒ–
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
+ To get started, set your OPENAI_API_KEY environment variable, or other required keys for the providers you selected.
 
 
 
 
 
 
 
 
2
 
3
+ Next, edit pf.yaml (the Promptfoo configuration file in this repo).
4
+
5
+ Then run:
6
+ ```
7
+ promptfoo eval
8
+ ```
9
+
10
+ Afterwards, you can view the results by running `promptfoo view`
abc.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
"""Smoke-test client: POST a sample question to the local QA API and print the reply."""
import requests

url = "http://127.0.0.1:8000/api"
data = {"question": "What is AIPIPE?", "api_key": "your-api-key"}

# timeout: without it, requests.post blocks forever if the server is down/hung.
response = requests.post(url, json=data, timeout=30)
print(response.text)
api.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uvicorn
3
+ import requests
4
+ import json
5
+ import numpy as np
6
+ import faiss
7
+ from dotenv import load_dotenv
8
+ from collections import defaultdict
9
+ from fastapi import FastAPI, HTTPException, Request
10
+ from pydantic import BaseModel
11
+ from sentence_transformers import SentenceTransformer
12
+
13
# --- Application & configuration setup ---
app = FastAPI()

# Pull secrets from a local .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv("AIPIPE_API_KEY")

# Fail fast at import time: without the key, every request would fail later anyway.
if not api_key:
    raise RuntimeError("Missing API key in environment variables.")
22
+
23
# --- Load Discourse Data ---
try:
    with open("data/discourse_posts.json", "r", encoding="utf-8") as f:
        posts_data = json.load(f)
except FileNotFoundError as err:
    # Chain the original exception so the traceback keeps the failing path (PEP 3134).
    raise RuntimeError("Could not find 'data/discourse_posts.json'. Ensure the file is in the correct location.") from err

# Group posts by topic_id; each bucket remembers its title when any post carries one.
topics = defaultdict(lambda: {"topic_title": "", "posts": []})
for post in posts_data:
    tid = post["topic_id"]
    topics[tid]["posts"].append(post)
    if "topic_title" in post:
        topics[tid]["topic_title"] = post["topic_title"]

# Sort posts within topics by post_number so each thread reads in posting order.
for topic in topics.values():
    topic["posts"].sort(key=lambda x: x.get("post_number", 0))
41
+
42
# --- Embedding Setup ---
def normalize(v):
    """Scale *v* to unit L2 norm; a zero vector is returned unchanged."""
    magnitude = np.linalg.norm(v)
    if magnitude == 0:
        return v
    return v / magnitude
46
+
47
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embedding_data = []   # per-subthread metadata, parallel to `embeddings`
embeddings = []       # unit-norm vectors, one per root subthread

# Process topics for FAISS: one embedding per root post plus its reply subtree.
for tid, data in topics.items():
    posts = data["posts"]
    title = data["topic_title"]
    reply_map = defaultdict(list)
    by_number = {}

    # Index posts by number and bucket them under their parent post.
    for p in posts:
        pn = p.get("post_number")
        if pn is not None:
            by_number[pn] = p
        parent = p.get("reply_to_post_number")
        reply_map[parent].append(p)

    def extract(pn):
        """Collect post `pn` and all transitive replies in preorder."""
        collected = []
        def dfs(n):
            if n not in by_number:
                return
            p = by_number[n]
            collected.append(p)
            for child in reply_map.get(n, []):
                dfs(child.get("post_number"))
        dfs(pn)
        return collected

    # Roots are posts that are not a reply to anything.
    roots = [p for p in posts if not p.get("reply_to_post_number")]
    for root in roots:
        root_num = root.get("post_number", 1)
        thread = extract(root_num)
        text = f"Topic: {title}\n\n" + "\n\n---\n\n".join(
            p.get("content", "").strip() for p in thread if p.get("content")
        )
        emb = normalize(embedder.encode(text, convert_to_numpy=True))
        embedding_data.append({
            "topic_id": tid,
            "topic_title": title,
            "root_post_number": root_num,
            "post_numbers": [p.get("post_number") for p in thread],
            "combined_text": text
        })
        embeddings.append(emb)

# Guard: `embeddings[0]` below would raise a bare IndexError on an empty corpus
# (the sibling script semantic_search_local.py already guards this case).
if not embeddings:
    raise RuntimeError("No subthreads were embedded; check data/discourse_posts.json.")

# Create FAISS index; inner product equals cosine similarity on unit-norm vectors.
index = faiss.IndexFlatIP(len(embeddings[0]))
index.add(np.vstack(embeddings).astype("float32"))
97
+
98
# --- API Input Model ---
from typing import Optional

class QuestionInput(BaseModel):
    """Request body for POST /api/."""
    # The user's question; emptiness is rejected in the endpoint.
    question: str
    # Optional image payload; accepted for API compatibility but unused here.
    # Fix: `image: str = None` was an invalid annotation (None is not a str).
    image: Optional[str] = None
102
+
103
+
104
+
105
# --- AIPIPE API Configuration ---
AIPIPE_URL = "https://your-aipipe-endpoint.com/chat/completions"
AIPIPE_KEY = api_key

def query_aipipe(prompt):
    """POST *prompt* to the AIPIPE chat-completions endpoint.

    Returns the parsed JSON response on HTTP 200; raises HTTPException(500)
    carrying the upstream body text on any other status.
    """
    headers = {"Authorization": f"Bearer {AIPIPE_KEY}", "Content-Type": "application/json"}
    data = {"model": "gpt-4o-mini", "messages": [{"role": "user", "content": prompt}], "temperature": 0.7}

    # timeout: a stalled upstream must not hang the whole request handler forever.
    response = requests.post(AIPIPE_URL, json=data, headers=headers, timeout=60)
    if response.status_code == 200:
        return response.json()
    raise HTTPException(status_code=500, detail=f"AIPIPE API error: {response.text}")
118
+
119
# --- API Endpoint for Answer Generation ---
@app.post("/api/")
async def answer_question(payload: QuestionInput):
    """Answer a question via FAISS retrieval + the AIPIPE LLM.

    Fix: the top-3 retrieved subthreads were previously fetched but never
    passed to the model (query_aipipe received the bare question, defeating
    the retrieval step). The context is now embedded in the prompt.

    Returns {"answer": str, "links": [{"url", "text"}]}.
    Raises HTTPException 400 on an empty question, 500 on upstream failure.
    """
    q = payload.question

    # Ensure query is valid
    if not q:
        raise HTTPException(status_code=400, detail="Question field cannot be empty.")

    # Search FAISS Index for the 3 nearest subthreads.
    q_emb = normalize(embedder.encode(q, convert_to_numpy=True)).astype("float32")
    D, I = index.search(np.array([q_emb]), 3)

    top_results = []
    for score, idx in zip(D[0], I[0]):
        data = embedding_data[idx]
        top_results.append({
            "score": float(score),
            "text": data["combined_text"],
            "topic_id": data["topic_id"],
            "url": f"https://discourse.onlinedegree.iitm.ac.in/t/{data['topic_id']}"
        })

    # Ground the LLM in the retrieved threads instead of sending the bare question.
    context = "\n\n===\n\n".join(r["text"] for r in top_results)
    prompt = (
        "Answer the question using the following forum discussions.\n\n"
        f"{context}\n\nQuestion: {q}\nAnswer:"
    )

    try:
        answer_response = query_aipipe(prompt)
        answer = answer_response.get("choices", [{}])[0].get("message", {}).get("content", "No response.")
    except HTTPException:
        # Preserve the status/detail query_aipipe already attached instead of rewrapping.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching response from AIPIPE: {str(e)}")

    links = [{"url": r["url"], "text": r["text"][:120]} for r in top_results]
    return {"answer": answer, "links": links}
151
+
152
# --- Run the Server ---
if __name__ == "__main__":
    # reload=True is a development convenience; uvicorn's documented defaults
    # serve on 127.0.0.1:8000 (matching the URL abc.py targets).
    uvicorn.run("api:app", reload=True)
auth.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cookies": [{"name": "OTZ", "value": "8118346_34_34__34_", "domain": "accounts.google.com", "path": "/", "expires": 1751996758, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "NID", "value": "524=pGzN3AM4nktYQ9VMLlUJRVGl_-N9oMB6VQ3y5ZzHlJMUjidFhWpx-wj6mD-1yQgvKheuVi9mm7qessL2ykyLjtbKpfTq2WCgynkS95EkkkqXMq7U75UrCWoPbvTRd6veMOI6C6pElgLmxblvDr-LVidfbiS9qSqChtwuOiYKpHWijMwoWaKiTcbanPCEmvqkNeV-rZtfov0MNPt9PNOo7EQZNz9SzosAi1lykwflQWAbaSe9d-W4R95Sbv0kDcbO-_zQ5Y8TFdvc9yH9gpQhuW2X38R8TBVvkyUMDbNhJyfYE0ojJK7lNUx251m2skHFgYuQFEPY1VxD4JbGuQD_oM3V0N9SOW-omyMI3JTL6nhTIXtiAvOjs7y9ya0O3NfcDbttscYZORgMjI-0rvkqsUiN1XxRTWOuhCC_ZB4H5O44LwdGdyr3MKsWTSMC14osSSLzKFQkNeYUOF4chQhDMDNdZnimF3CXbhFKHs4cj3O8SPfUDNThNJzbxcmd1MnhbKkSt0wrVaU9bo5Xu_S2bdKaGtOZnzv3QOgv0C6S1l2-1PZVkvIzzptNITT0ivV3wenZJjfHwNPdlmb4ICMV8UyexG-LtcDTOYReU8z1Lv6v-isvfYBASqLo_g", "domain": ".google.com", "path": "/", "expires": 1765215958.960009, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "SID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4gJFIH3mFLpSqgSCdtlzmkAACgYKAfMSARISFQHGX2MicfviDNpac-kl2qNmcIh3JhoVAUF8yKrvZgwyDZB41fASWRBTKWv30076", "domain": ".google.com", "path": "/", "expires": 1783964850.960194, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4SMPilUhML70XfSXFgcXh1gACgYKAc0SARISFQHGX2Mia96MxI3v2UvdIesQ37uf-BoVAUF8yKrUcNT98CQMUjngFXRwODfS0076", "domain": ".google.com", "path": "/", "expires": 1783964850.96023, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4JC7T22vhP_XXy2jXs53PRQACgYKAfcSARISFQHGX2MiUHTSX37QUhtQc4NSoQ1CABoVAUF8yKoFTOV_FouQBFWA5RF1_EnW0076", "domain": ".google.com", "path": "/", "expires": 1783964850.960261, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "HSID", "value": "AgSi0p7ogPeWgCNUQ", "domain": ".google.com", "path": "/", 
"expires": 1783964850.960393, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "SSID", "value": "AiCZVzx9BdRDHVheM", "domain": ".google.com", "path": "/", "expires": 1783964850.960421, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "APISID", "value": "R0cKHrXJralSylP4/Aq8bgiMom9f5Sr5f1", "domain": ".google.com", "path": "/", "expires": 1783964850.960458, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "SAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.com", "path": "/", "expires": 1783964850.960483, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-1PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.com", "path": "/", "expires": 1783964850.960525, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.com", "path": "/", "expires": 1783964850.960551, "httpOnly": false, "secure": true, "sameSite": "None"}, {"name": "ACCOUNT_CHOOSER", "value": "AFx_qI7GYU-n0DopkJTbsr_pk9zQ5pY0thVpNAu6s_a2pDOfodRPrPB1DaoafRhjGyD63vbwDRLJnz2ELcUE1SLgCDDXMAL12rdeRMGWP_t_yViqq6u_CSWnLrNKrMD3MDJIgCKNcDOn", "domain": "accounts.google.com", "path": "/", "expires": 1783964850.960575, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Host-GAPS", "value": "1:6NhIn6plsmu1S2-HryhYFnQ_UEOEqr-P85AR0m7sPJLPI1hm7eKy9JdSzqkk55fEVEcYKKanEYe8AP0AO4ovOSK--_OaEQ:S_Y5W5vYNRLpy9BM", "domain": "accounts.google.com", "path": "/", "expires": 1783964850.960603, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "LSID", "value": "s.IN|s.youtube:g.a000xwjUlyTdYTHzF80gbNT7x_ibthdc2qjxxz_h2QwjYFf7S5T8HvrhsOwZ68zOO8PJMD450QACgYKAbUSARISFQHGX2Mit6FsHavPFqVeUb-IjtvZtRoVAUF8yKrKO2C4Yj00E65ati57Kj360076", "domain": "accounts.google.com", "path": "/", "expires": 1783964851.100308, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Host-1PLSID", "value": 
"s.IN|s.youtube:g.a000xwjUlyTdYTHzF80gbNT7x_ibthdc2qjxxz_h2QwjYFf7S5T8TV6aAzkWzk8QWXGmDWGxugACgYKAQkSARISFQHGX2MiNB9FZAeOCpB0aeacnGyJYxoVAUF8yKqbz93N6uv5thWilVvXWR4q0076", "domain": "accounts.google.com", "path": "/", "expires": 1783964851.100508, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Host-3PLSID", "value": "s.IN|s.youtube:g.a000xwjUlyTdYTHzF80gbNT7x_ibthdc2qjxxz_h2QwjYFf7S5T8JJw8WRM2HuDa-udrKltuVgACgYKAaISARISFQHGX2Mi0Js4l2JXX2ipUuQ_mdnPchoVAUF8yKqNDyQG3flLgDqxLIyIi74F0076", "domain": "accounts.google.com", "path": "/", "expires": 1783964851.10057, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "__Secure-1PSIDTS", "value": "sidts-CjEB5H03P6P3jGo43x7iUx0MEGc4ah0vRt5yvc9XT00Wr_XUEb43WeZg2WvE5FCf1OqNEAA", "domain": ".youtube.com", "path": "/", "expires": 1780940851.445885, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSIDTS", "value": "sidts-CjEB5H03P6P3jGo43x7iUx0MEGc4ah0vRt5yvc9XT00Wr_XUEb43WeZg2WvE5FCf1OqNEAA", "domain": ".youtube.com", "path": "/", "expires": 1780940851.446026, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "HSID", "value": "AZaNjsQu3eYQM6N3L", "domain": ".youtube.com", "path": "/", "expires": 1783964851.44606, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "SSID", "value": "ADenAUJPbV1Tx51cQ", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446163, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "APISID", "value": "R0cKHrXJralSylP4/Aq8bgiMom9f5Sr5f1", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446187, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "SAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446213, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-1PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446235, 
"httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446259, "httpOnly": false, "secure": true, "sameSite": "None"}, {"name": "SID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4gJFIH3mFLpSqgSCdtlzmkAACgYKAfMSARISFQHGX2MicfviDNpac-kl2qNmcIh3JhoVAUF8yKrvZgwyDZB41fASWRBTKWv30076", "domain": ".youtube.com", "path": "/", "expires": 1783964851.44628, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4SMPilUhML70XfSXFgcXh1gACgYKAc0SARISFQHGX2Mia96MxI3v2UvdIesQ37uf-BoVAUF8yKrUcNT98CQMUjngFXRwODfS0076", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446306, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4JC7T22vhP_XXy2jXs53PRQACgYKAfcSARISFQHGX2MiUHTSX37QUhtQc4NSoQ1CABoVAUF8yKoFTOV_FouQBFWA5RF1_EnW0076", "domain": ".youtube.com", "path": "/", "expires": 1783964851.446333, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "HSID", "value": "AZaNjsQu3eYQM6N3L", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762236, "httpOnly": true, "secure": false, "sameSite": "Lax"}, {"name": "SSID", "value": "ADenAUJPbV1Tx51cQ", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762376, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "APISID", "value": "R0cKHrXJralSylP4/Aq8bgiMom9f5Sr5f1", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762444, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "SAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762477, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-1PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", 
"domain": ".google.co.in", "path": "/", "expires": 1783964851.762502, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PAPISID", "value": "qVH55bIibZoiVmPz/AGVEBc3223LCMUWOH", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762534, "httpOnly": false, "secure": true, "sameSite": "None"}, {"name": "NID", "value": "524=Xm8tnyfiRZdPI-gz8yA_hamoGHU9acfj34QWWhuxcm_rbHeiOL1i6KjHyXk4Adun5DIsKu8N8f37OYW2XYvO6wG6Jyj2AtsNpp78vFllJoC1HHVOVRBIkXG0V21cgTjAZyl2Qcedfrwi7q1X7wVUfhIDUBD4CHF1PTm4YwjW4XAxIRWKwvxSuRClcI8DJTGl5SoIUEj0GGyx", "domain": ".google.co.in", "path": "/", "expires": 1765216051.762559, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "SID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4gJFIH3mFLpSqgSCdtlzmkAACgYKAfMSARISFQHGX2MicfviDNpac-kl2qNmcIh3JhoVAUF8yKrvZgwyDZB41fASWRBTKWv30076", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762587, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4SMPilUhML70XfSXFgcXh1gACgYKAc0SARISFQHGX2Mia96MxI3v2UvdIesQ37uf-BoVAUF8yKrUcNT98CQMUjngFXRwODfS0076", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762623, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSID", "value": "g.a000xwjUlyyhWXqZaIMuSE3XUnz3p4-cmKnHiAoN0pT4D6NfP0u4JC7T22vhP_XXy2jXs53PRQACgYKAfcSARISFQHGX2MiUHTSX37QUhtQc4NSoQ1CABoVAUF8yKoFTOV_FouQBFWA5RF1_EnW0076", "domain": ".google.co.in", "path": "/", "expires": 1783964851.762652, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "SIDCC", "value": "AKEyXzUFCfCv1cpXJ_r1W4aMYnWqNNHWO64uf_F90cBkOzDRVQRUxPwaplWxjg_RBU_Wvx3KXw", "domain": ".google.com", "path": "/", "expires": 1780940852.107024, "httpOnly": false, "secure": false, "sameSite": "Lax"}, {"name": "__Secure-1PSIDCC", "value": "AKEyXzWjEV-GRcLQ_FrvB4nZJ8eDSjl6fOAeVx7AOmdzpQuSX8PJGZ3z5_IR7zhR9_TUmBc-hg", "domain": 
".google.com", "path": "/", "expires": 1780940852.107167, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "__Secure-3PSIDCC", "value": "AKEyXzUVBh_9F9t2EXuTR5EhQe8lgmz7BY3W1_q6sY3MHvgOomOpkJpjXpSg1bJSJVc_zW7BTA", "domain": ".google.com", "path": "/", "expires": 1780940852.107219, "httpOnly": true, "secure": true, "sameSite": "None"}, {"name": "_t", "value": "mYXw3oiXxu4xNIUtU9E2cXmEMB08yK9D461vkL8ijLzEC41lU9SgTUkecDDbqpJwoXPZHivj5gh7dPw5Ye2Z%2FSAk2Rtt8QkXuUKUaDRwWCc4DG9j2Gp%2BHBqjFpB%2F3AyW8BcBlkzE5%2BT8E5Arot2Nu023fhbXkVzHO8GHgLctfRL0VzN2Tgh%2Fn%2F%2FYpFVrTNkufiVSP1krNgf%2BaveFark0yuTgdeF14YCVFArh24%2FAhaX7HnS74ihXzi1CejwMknjrvG3gcWCg%2F2MkeLSBXhEOoOCYJswlc7QOCN9dJjcsWFAJvLu1KJU6f%2FNXIq7vB8%2Ba--xJIIe9ITH%2F7QG26Q--8zIvN%2FMJeDdtZmPKnd8lfg%3D%3D", "domain": "discourse.onlinedegree.iitm.ac.in", "path": "/", "expires": 1754588854.026139, "httpOnly": true, "secure": true, "sameSite": "Lax"}, {"name": "_bypass_cache", "value": "true", "domain": "discourse.onlinedegree.iitm.ac.in", "path": "/", "expires": -1, "httpOnly": false, "secure": true, "sameSite": "Lax"}, {"name": "_forum_session", "value": "0wLfPer6SiQZetqZXqSbPeQkcqDMKZPZ%2BaXw19BWoHwGPMzSp0YbA%2BOGKr7QrGC7%2Fct8%2BacG4jy7BdBV0zgSiEBxHAKw8kbSmHRaYHfKWN1UyktwqWL7U%2BNhX3aBlzYWBRh2Ym5Pc7DVKllXYHmvx0w6lSXQdot490FdWo1uwLW79sVQYUIc4OUXpGal85Dc9V3kk8vUD38c69qOOd4%2BHt%2F7ABghWKtgKYmfD0Do1zTno%2FZawFW07jIuOpVIeFs15H7%2B2CzpW57m8o2%2B4FvRfxWr8A3iYjqnVKXIOVrXB6EtvL2aduVzSaJSuUTpIggxkX0UH22ymyDg6lGOX08Z%2Fof84GouSoZZNvDc8CXqeVoBH%2BLrh0cNpKD%2FCFpQEQ%3D%3D--QlOaYw03YSQJlGs4--cEB8HOMUUqgtVsgTmKSQGg%3D%3D", "domain": "discourse.onlinedegree.iitm.ac.in", "path": "/", "expires": -1, "httpOnly": true, "secure": true, "sameSite": "Lax"}], "origins": [{"origin": "https://discourse.onlinedegree.iitm.ac.in", "localStorage": [{"name": "__mbLastAjax", "value": "1749404907183"}, {"name": "discourse_push_notifications_subscribed-21879", "value": ""}, {"name": "discourse_sidebar-section-tags-collapsed", "value": 
"false"}, {"name": "discourse_desktop_notifications_focus-tracker", "value": "788e1b8dfe2b4052b12424ba130b7217"}, {"name": "discourse_sidebar-section-community-collapsed", "value": "false"}, {"name": "discourse_sidebar-section-categories-collapsed", "value": "false"}, {"name": "safeLocalStorage", "value": "true"}]}]}
customProvider.js ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module.exports = class CustomAPIProvider {
2
+ id() {
3
+ return "custom-api"; // Ensure this method correctly returns the provider ID.
4
+ }
5
+
6
+ async callApi(prompt) {
7
+ const response = await fetch("http://127.0.0.1:8000/ask", {
8
+ method: "POST",
9
+ headers: { "Content-Type": "application/json" },
10
+ body: JSON.stringify({ question: prompt }),
11
+ });
12
+
13
+ const data = await response.json();
14
+ return { output: data.answer };
15
+ }
16
+ };
metadata.json ADDED
The diff for this file is too large to render. See raw diff
 
pf.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Promptfoo configuration: evaluate the local FastAPI QA service.
providers:
  - id: http://127.0.0.1:8000/api
    label: Local FastAPI
    config:
      url: http://127.0.0.1:8000/api
      method: POST
      headers:
        Content-Type: application/json
      # JSON request template; {{prompt}} and {{AIPIPE_API_KEY}} are filled in
      # by promptfoo. (Fixed: a stray blank line inside the object.)
      body: |
        {
          "question": "{{prompt}}",
          "api_key": "{{AIPIPE_API_KEY}}"
        }

prompts:
  - "{{prompt}}"

# Each test sends one question and asserts a substring of the answer.
tests:
  - name: Model usage confusion
    vars:
      prompt: >
        The question asks to use gpt-3.5-turbo-0125 model, but the AI proxy provided by Anand sir only supports gpt-4o-mini. Should we just use gpt-4o-mini or OpenAI API for gpt-3.5 turbo?
    assert:
      - type: contains
        value: gpt-4o-mini

  - name: SRS + Bonus display
    vars:
      prompt: >
        If a student scores 10/10 on GA4 as well as a bonus, how would it appear on the dashboard?
    assert:
      - type: contains
        value: bonus mark

  - name: Docker recommendation
    vars:
      prompt: >
        I know Docker but have not used Podman before. Should I use Docker for this course?
    assert:
      - type: contains
        value: Docker CE

  - name: TDS Sep 2025 Exam Date
    vars:
      prompt: >
        When is the TDS Sep 2025 end-term exam?
    assert:
      - type: contains
        value: date

  - name: OpenAI API key validation
    vars:
      prompt: >
        I have my OpenAI API key saved in the .env file. How can I ensure it's loaded correctly?
    assert:
      - type: contains
        value: os.getenv("OPENAI_API_KEY")

  - name: FastAPI server issue
    vars:
      prompt: >
        My FastAPI server is running, but hitting 127.0.0.1:8000 returns "Not Found." What's wrong?
    assert:
      - type: contains
        value: No route defined for "/"

  - name: Promptfoo response validation
    vars:
      prompt: >
        How can I validate my Promptfoo configuration file?
    assert:
      - type: contains
        value: promptfoo validate pf.yaml
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# UI / misc
streamlit
tqdm
# API service (api.py)
fastapi
uvicorn
requests
python-dotenv
numpy
# Embedding + retrieval
sentence-transformers
transformers
torch
faiss-cpu
semantic_search_local.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # semantic_search_pipeline.py
2
+
3
+ import json
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from collections import defaultdict
7
+ from sentence_transformers import SentenceTransformer
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+ import faiss
10
+
11
# --- Utility functions ---
def clean_text(text):
    """Collapse all runs of whitespace in *text* to single spaces; '' for falsy input."""
    if not text:
        return ""
    # str.split() with no argument already ignores leading/trailing whitespace.
    return " ".join(text.split())
14
+
15
def normalize(v):
    """Return *v* scaled to unit L2 norm; a zero vector passes through unchanged."""
    length = np.linalg.norm(v)
    return v if length == 0 else v / length
18
+
19
# --- Load posts ---
with open("data/discourse_posts.json", "r", encoding="utf-8") as f:
    posts_data = json.load(f)

print(f"βœ… Loaded {len(posts_data)} posts")

# Backfill a sequential 'post_number' (per topic, in file order) for posts
# that lack one, so later thread reconstruction never sees a missing key.
grouped = defaultdict(list)
for post in posts_data:
    grouped[post["topic_id"]].append(post)

for topic_posts in grouped.values():
    for position, post in enumerate(topic_posts, start=1):
        post.setdefault("post_number", position)

# --- Group by topic_id ---
topics = defaultdict(lambda: {"topic_title": "", "posts": []})
for post in posts_data:
    tid = post["topic_id"]
    topics[tid]["posts"].append(post)
    if "topic_title" in post:
        topics[tid]["topic_title"] = post["topic_title"]

# Posts inside each topic read in posting order.
for topic in topics.values():
    topic["posts"].sort(key=lambda x: x.get("post_number", 0))

print(f"βœ… Grouped into {len(topics)} topics")

# --- Embedding Model ---
model_name = "all-MiniLM-L6-v2"  # Or "GritLM/GritLM-8x7B"
embedder = SentenceTransformer(model_name)
50
+
51
# --- Build reply tree ---
def build_reply_map(posts):
    """Index *posts* for thread reconstruction.

    Returns (reply_map, posts_by_number): reply_map maps a parent
    post_number (None for roots) to its direct replies in input order;
    posts_by_number maps post_number -> post. Posts without a
    'post_number' are skipped entirely.
    """
    posts_by_number = {}
    reply_map = defaultdict(list)
    for entry in posts:
        number = entry.get("post_number")
        if number is None:
            continue
        posts_by_number[number] = entry
        reply_map[entry.get("reply_to_post_number")].append(entry)
    return reply_map, posts_by_number
63
+
64
def extract_subthread(root_num, reply_map, posts_by_number):
    """Collect the post at *root_num* and all transitive replies, in preorder.

    Unknown post numbers are silently skipped; an unknown root yields [].
    """
    thread = []
    pending = [root_num]
    while pending:
        number = pending.pop()
        if number not in posts_by_number:
            continue
        thread.append(posts_by_number[number])
        # Push children reversed so the left-most reply is visited first,
        # matching the recursive DFS order.
        children = reply_map.get(number, [])
        pending.extend(child["post_number"] for child in reversed(children))
    return thread
75
+
76
# --- Embed subthreads ---
embedding_data = []   # per-subthread metadata, parallel to `embeddings`
embeddings = []       # unit-norm vectors, one per root subthread

print("πŸ”„ Building subthread embeddings...")

for tid, data in tqdm(topics.items()):
    posts = data["posts"]
    title = data["topic_title"]
    reply_map, by_number = build_reply_map(posts)

    # Roots are posts that are not a reply to anything.
    root_posts = [p for p in posts if not p.get("reply_to_post_number")]

    if not root_posts:
        print(f"⚠️ No root posts found for topic ID {tid}. Skipping.")
        continue

    for root in root_posts:
        if "post_number" not in root:
            print(f"⚠️ Skipping root post due to missing 'post_number': {root}")
            continue
        root_num = root["post_number"]

        # One embedding per root subthread: topic title + all post bodies.
        subthread = extract_subthread(root_num, reply_map, by_number)
        combined = f"Topic: {title}\n\n" + "\n\n---\n\n".join(
            clean_text(p["content"]) for p in subthread if "content" in p
        )

        emb = embedder.encode(combined, convert_to_numpy=True)
        emb = normalize(emb)

        embedding_data.append({
            "topic_id": tid,
            "topic_title": title,
            "root_post_number": root_num,
            "post_numbers": [p["post_number"] for p in subthread if "post_number" in p],
            "combined_text": combined
        })
        embeddings.append(emb)

if not embeddings:
    print("❌ No embeddings were generated. Exiting.")
    # Fix: exit() is the interactive-session helper added by site.py and is not
    # guaranteed in scripts; SystemExit is the portable way to stop (same exit code).
    raise SystemExit

embeddings = np.vstack(embeddings).astype("float32")

# --- Build FAISS index ---
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product == cosine on unit-norm vectors
index.add(embeddings)

print(f"βœ… Indexed {len(embedding_data)} subthreads")
128
+
129
# --- Semantic retrieval ---
def retrieve(query, top_k=5):
    """Embed *query* and return the *top_k* most similar subthreads.

    Each result dict carries the similarity score plus the stored
    subthread metadata and combined text.
    """
    query_vec = embedder.encode(query, convert_to_numpy=True)
    query_vec = normalize(query_vec).astype("float32")
    scores, positions = index.search(np.array([query_vec]), top_k)

    hits = []
    for score, pos in zip(scores[0], positions[0]):
        entry = embedding_data[pos]
        hits.append({
            "score": float(score),
            "topic_id": entry["topic_id"],
            "topic_title": entry["topic_title"],
            "root_post_number": entry["root_post_number"],
            "post_numbers": entry["post_numbers"],
            "combined_text": entry["combined_text"],
        })
    return hits
147
+
148
# --- QA generation using T5 ---
gen_model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

def generate_answer(query, contexts, max_len=256):
    """Generate an answer to *query* grounded in *contexts* using FLAN-T5.

    contexts: list of retrieved thread texts, joined into a single prompt.
    max_len: cap on the generated sequence length.
    """
    joined = "\n\n".join(contexts)
    prompt = f"Answer the question based on the following forum discussion:\n\n{joined}\n\nQuestion: {query}\nAnswer:"
    encoded = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True)
    generated = qa_model.generate(**encoded, max_length=max_len, num_beams=5, early_stopping=True)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
160
+
161
# --- Run Example ---
if __name__ == "__main__":
    query = "If a student scores 10/10 on GA4 as well as a bonus, how would it appear on the dashboard?"

    results = retrieve(query, top_k=3)

    # Show what was retrieved before generating, for manual inspection.
    print("\nπŸ” Top Retrieved Threads:")
    for rank, hit in enumerate(results, 1):
        print(f"\n[{rank}] Score: {hit['score']:.4f}")
        print(f"Topic Title: {hit['topic_title']}")
        print(f"Root Post #: {hit['root_post_number']} | Post IDs: {hit['post_numbers']}")
        print(f"Snippet:\n{hit['combined_text'][:300]}...\n")

    contexts = [hit["combined_text"] for hit in results]
    answer = generate_answer(query, contexts)

    print("\nπŸ’‘ Generated Answer:\n", answer)