Spaces:

samsonleegh
/

bro_wedding_helper

Runtime error

App Files Files Community

samsonleegh committed on Oct 21, 2025

Commit

f9aabb7

verified ·

1 Parent(s): 9ba51f7

Update app_webhook.py

Browse files

Files changed (1) hide show

app_webhook.py +33 -27

app_webhook.py CHANGED Viewed

@@ -1,4 +1,6 @@
-import os, glob, hashlib
 import numpy as np
 import pandas as pd
 from typing import List, Tuple
@@ -12,13 +14,14 @@ import faiss
 from telegram import Update
 from telegram.ext import Application, CommandHandler, MessageHandler, ContextTypes, AIORateLimiter, filters
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
-PUBLIC_URL = os.getenv("PUBLIC_URL", "")  # e.g. https://username-space.hf.space
 OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
-STRICT_DOC_MODE = (os.getenv("STRICT_DOC_MODE", "true").lower() == "true")
 DOCS_DIR = os.getenv("DOCS_DIR", "wedding_docs")
 INDEX_PATH = os.getenv("INDEX_PATH", "wedding.index")
 META_CSV = os.getenv("META_CSV", "wedding_chunks.csv")
@@ -26,17 +29,17 @@ META_CSV = os.getenv("META_CSV", "wedding_chunks.csv")
 client = OpenAI(api_key=OPENAI_API_KEY)
 # ---------- Doc loaders ----------
 def read_txt_md(path: str) -> str:
-    return open(path, "r", encoding="utf-8", errors="ignore").read()
 def read_docx(path: str) -> str:
     doc = DocxDocument(path)
-    return " ".join(p.text for p in doc.paragraphs)
 def read_pdf(path: str) -> str:
     reader = PdfReader(path)
-    return " ".join((p.extract_text() or "") for p in reader.pages)
 def load_all_docs(folder: str) -> List[Tuple[str, str]]:
     paths = []
@@ -57,13 +60,12 @@ def load_all_docs(folder: str) -> List[Tuple[str, str]]:
     return docs
 # ---------- Index ----------
-def chunk_text(text: str, source: str, chunk_size: int = 300, overlap: int = 50):
     words = text.split()
     i = 0
     while i < len(words):
-        chunk = " ".join(words[i:i+chunk_size])
-        yield {"source": source, "chunk": chunk, "hash": hashlib.md5((source+str(i)).encode()).hexdigest()}
         i += (chunk_size - overlap)
 def embed_texts(texts: list[str]) -> np.ndarray:
@@ -83,6 +85,7 @@ class RAGIndex:
             raise RuntimeError(f"No docs in {DOCS_DIR}/")
         index_exists = os.path.exists(INDEX_PATH) and os.path.exists(META_CSV)
         need = force or not index_exists
         if index_exists and not need:
             df = pd.read_csv(META_CSV)
             vecs = np.load(INDEX_PATH)
@@ -91,7 +94,8 @@ class RAGIndex:
             idx.add(vecs)
             self.index, self.df, self.dim = idx, df, vecs.shape[1]
             return
-        # build
         chunks = []
         for p, t in docs:
             for c in chunk_text(t, p):
@@ -105,7 +109,7 @@ class RAGIndex:
         idx.add(vecs)
         self.index, self.df, self.dim = idx, df, vecs.shape[1]
-    def retrieve(self, q: str, k=10):
         qv = embed_texts([q])
         faiss.normalize_L2(qv)
         D, I = self.index.search(qv, k)
@@ -120,30 +124,29 @@ RAG = RAGIndex()
 SYSTEM_PROMPT = (
     "You are a concise wedding assistant for Samson’s brother’s wedding. "
-    "Use ONLY the provided context. If missing, say so and suggest contacting Overall IC. Keep answers under 150 words."
 )
 async def answer_with_rag(q: str) -> str:
-    ctx = RAG.retrieve(q, k=10)
     blocks = []
     for r in ctx:
         t = r["chunk"]
         if len(t) > 800:
             t = t[:800] + "…"
-        blocks.append(f"[Source: {os.path.basename(r['source'])}]\n{t}")  # use \n
-    context_text = "\n\n".join(blocks)  # proper delimiter
     completion = client.chat.completions.create(
         model=OPENAI_MODEL,
         messages=[
             {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": f"Context from docs:\n\n{context_text}\n\nQuestion: {q}"},
         ],
         temperature=0.2,
     )
     a = completion.choices[0].message.content.strip()
     if STRICT_DOC_MODE and not blocks:
         return (
             "I couldn’t find this in the docs. Please check the playbook or ask the Overall IC. "
@@ -159,17 +162,21 @@ async def start_telegram():
     global telegram_app
     if telegram_app is not None:
         return telegram_app
     RAG.load_or_build(force=False)
-    application = Application.builder() \
-        .token(TELEGRAM_BOT_TOKEN) \
-        .rate_limiter(AIORateLimiter()) \
         .build()
     async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
         await update.message.reply_text(
-            "Hello! Ask me anything about roles, timings, addresses, and logistics.
-"
-            "Admins can use /refresh after updating docs.")
     async def help_cmd(update: Update, context: ContextTypes.DEFAULT_TYPE):
         await update.message.reply_text("Use /refresh or just ask your question in plain text.")
@@ -213,4 +220,3 @@ async def telegram_webhook(token: str, request: Request):
     update = Update.de_json(data, (await start_telegram()).bot)
     await (await start_telegram()).process_update(update)
     return JSONResponse({"ok": True})

+import os
+import glob
+import hashlib
 import numpy as np
 import pandas as pd
 from typing import List, Tuple
 from telegram import Update
 from telegram.ext import Application, CommandHandler, MessageHandler, ContextTypes, AIORateLimiter, filters
+# ---------- Load environment ----------
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
+PUBLIC_URL = os.getenv("PUBLIC_URL", "")  # e.g., https://username-space.hf.space
 OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
+STRICT_DOC_MODE = os.getenv("STRICT_DOC_MODE", "true").lower() == "true"
 DOCS_DIR = os.getenv("DOCS_DIR", "wedding_docs")
 INDEX_PATH = os.getenv("INDEX_PATH", "wedding.index")
 META_CSV = os.getenv("META_CSV", "wedding_chunks.csv")
 client = OpenAI(api_key=OPENAI_API_KEY)
 # ---------- Doc loaders ----------
 def read_txt_md(path: str) -> str:
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
+        return f.read()
 def read_docx(path: str) -> str:
     doc = DocxDocument(path)
+    return "\n".join(p.text for p in doc.paragraphs)
 def read_pdf(path: str) -> str:
     reader = PdfReader(path)
+    return "\n".join((p.extract_text() or "") for p in reader.pages)
 def load_all_docs(folder: str) -> List[Tuple[str, str]]:
     paths = []
     return docs
 # ---------- Index ----------
+def chunk_text(text: str, source: str, chunk_size: int = 350, overlap: int = 50):
     words = text.split()
     i = 0
     while i < len(words):
+        chunk = " ".join(words[i:i + chunk_size])
+        yield {"source": source, "chunk": chunk, "hash": hashlib.md5((source + str(i)).encode()).hexdigest()}
         i += (chunk_size - overlap)
 def embed_texts(texts: list[str]) -> np.ndarray:
             raise RuntimeError(f"No docs in {DOCS_DIR}/")
         index_exists = os.path.exists(INDEX_PATH) and os.path.exists(META_CSV)
         need = force or not index_exists
         if index_exists and not need:
             df = pd.read_csv(META_CSV)
             vecs = np.load(INDEX_PATH)
             idx.add(vecs)
             self.index, self.df, self.dim = idx, df, vecs.shape[1]
             return
+        # build new index
         chunks = []
         for p, t in docs:
             for c in chunk_text(t, p):
         idx.add(vecs)
         self.index, self.df, self.dim = idx, df, vecs.shape[1]
+    def retrieve(self, q: str, k=6):
         qv = embed_texts([q])
         faiss.normalize_L2(qv)
         D, I = self.index.search(qv, k)
 SYSTEM_PROMPT = (
     "You are a concise wedding assistant for Samson’s brother’s wedding. "
+    "Use ONLY the provided context. If missing, say so and suggest contacting Overall IC. "
+    "Keep answers under 150 words."
 )
 async def answer_with_rag(q: str) -> str:
+    ctx = RAG.retrieve(q, k=6)
     blocks = []
     for r in ctx:
         t = r["chunk"]
         if len(t) > 800:
             t = t[:800] + "…"
+        blocks.append(f"[Source: {os.path.basename(r['source'])}]\n{t}")
+    context_text = "\n\n".join(blocks)
     completion = client.chat.completions.create(
         model=OPENAI_MODEL,
         messages=[
             {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": f"Context from docs:\n\n{context_text}\n\nQuestion: {q}"}
         ],
         temperature=0.2,
     )
     a = completion.choices[0].message.content.strip()
     if STRICT_DOC_MODE and not blocks:
         return (
             "I couldn’t find this in the docs. Please check the playbook or ask the Overall IC. "
     global telegram_app
     if telegram_app is not None:
         return telegram_app
     RAG.load_or_build(force=False)
+    application = (
+        Application.builder()
+        .token(TELEGRAM_BOT_TOKEN)
+        .rate_limiter(AIORateLimiter())
         .build()
+    )
     async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
         await update.message.reply_text(
+            "Hello! Ask me anything about roles, timings, addresses, and logistics.\n"
+            "Admins can use /refresh after updating docs."
+        )
     async def help_cmd(update: Update, context: ContextTypes.DEFAULT_TYPE):
         await update.message.reply_text("Use /refresh or just ask your question in plain text.")
     update = Update.de_json(data, (await start_telegram()).bot)
     await (await start_telegram()).process_update(update)
     return JSONResponse({"ok": True})