samsonleegh committed on
Commit
f9aabb7
·
verified ·
1 Parent(s): 9ba51f7

Update app_webhook.py

Browse files
Files changed (1) hide show
  1. app_webhook.py +33 -27
app_webhook.py CHANGED
@@ -1,4 +1,6 @@
1
- import os, glob, hashlib
 
 
2
  import numpy as np
3
  import pandas as pd
4
  from typing import List, Tuple
@@ -12,13 +14,14 @@ import faiss
12
  from telegram import Update
13
  from telegram.ext import Application, CommandHandler, MessageHandler, ContextTypes, AIORateLimiter, filters
14
 
 
15
  load_dotenv()
16
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
18
- PUBLIC_URL = os.getenv("PUBLIC_URL", "") # e.g. https://username-space.hf.space
19
  OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
20
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
21
- STRICT_DOC_MODE = (os.getenv("STRICT_DOC_MODE", "true").lower() == "true")
22
  DOCS_DIR = os.getenv("DOCS_DIR", "wedding_docs")
23
  INDEX_PATH = os.getenv("INDEX_PATH", "wedding.index")
24
  META_CSV = os.getenv("META_CSV", "wedding_chunks.csv")
@@ -26,17 +29,17 @@ META_CSV = os.getenv("META_CSV", "wedding_chunks.csv")
26
  client = OpenAI(api_key=OPENAI_API_KEY)
27
 
28
  # ---------- Doc loaders ----------
29
-
30
  def read_txt_md(path: str) -> str:
31
- return open(path, "r", encoding="utf-8", errors="ignore").read()
 
32
 
33
  def read_docx(path: str) -> str:
34
  doc = DocxDocument(path)
35
- return " ".join(p.text for p in doc.paragraphs)
36
 
37
  def read_pdf(path: str) -> str:
38
  reader = PdfReader(path)
39
- return " ".join((p.extract_text() or "") for p in reader.pages)
40
 
41
  def load_all_docs(folder: str) -> List[Tuple[str, str]]:
42
  paths = []
@@ -57,13 +60,12 @@ def load_all_docs(folder: str) -> List[Tuple[str, str]]:
57
  return docs
58
 
59
  # ---------- Index ----------
60
-
61
- def chunk_text(text: str, source: str, chunk_size: int = 300, overlap: int = 50):
62
  words = text.split()
63
  i = 0
64
  while i < len(words):
65
- chunk = " ".join(words[i:i+chunk_size])
66
- yield {"source": source, "chunk": chunk, "hash": hashlib.md5((source+str(i)).encode()).hexdigest()}
67
  i += (chunk_size - overlap)
68
 
69
  def embed_texts(texts: list[str]) -> np.ndarray:
@@ -83,6 +85,7 @@ class RAGIndex:
83
  raise RuntimeError(f"No docs in {DOCS_DIR}/")
84
  index_exists = os.path.exists(INDEX_PATH) and os.path.exists(META_CSV)
85
  need = force or not index_exists
 
86
  if index_exists and not need:
87
  df = pd.read_csv(META_CSV)
88
  vecs = np.load(INDEX_PATH)
@@ -91,7 +94,8 @@ class RAGIndex:
91
  idx.add(vecs)
92
  self.index, self.df, self.dim = idx, df, vecs.shape[1]
93
  return
94
- # build
 
95
  chunks = []
96
  for p, t in docs:
97
  for c in chunk_text(t, p):
@@ -105,7 +109,7 @@ class RAGIndex:
105
  idx.add(vecs)
106
  self.index, self.df, self.dim = idx, df, vecs.shape[1]
107
 
108
- def retrieve(self, q: str, k=10):
109
  qv = embed_texts([q])
110
  faiss.normalize_L2(qv)
111
  D, I = self.index.search(qv, k)
@@ -120,30 +124,29 @@ RAG = RAGIndex()
120
 
121
  SYSTEM_PROMPT = (
122
  "You are a concise wedding assistant for Samson’s brother’s wedding. "
123
- "Use ONLY the provided context. If missing, say so and suggest contacting Overall IC. Keep answers under 150 words."
 
124
  )
125
 
126
  async def answer_with_rag(q: str) -> str:
127
- ctx = RAG.retrieve(q, k=10)
128
  blocks = []
129
  for r in ctx:
130
  t = r["chunk"]
131
  if len(t) > 800:
132
  t = t[:800] + "…"
133
- blocks.append(f"[Source: {os.path.basename(r['source'])}]\n{t}") # use \n
134
-
135
- context_text = "\n\n".join(blocks) # proper delimiter
136
 
 
137
  completion = client.chat.completions.create(
138
  model=OPENAI_MODEL,
139
  messages=[
140
  {"role": "system", "content": SYSTEM_PROMPT},
141
- {"role": "user", "content": f"Context from docs:\n\n{context_text}\n\nQuestion: {q}"},
142
  ],
143
  temperature=0.2,
144
  )
145
  a = completion.choices[0].message.content.strip()
146
-
147
  if STRICT_DOC_MODE and not blocks:
148
  return (
149
  "I couldn’t find this in the docs. Please check the playbook or ask the Overall IC. "
@@ -159,17 +162,21 @@ async def start_telegram():
159
  global telegram_app
160
  if telegram_app is not None:
161
  return telegram_app
 
162
  RAG.load_or_build(force=False)
163
- application = Application.builder() \
164
- .token(TELEGRAM_BOT_TOKEN) \
165
- .rate_limiter(AIORateLimiter()) \
 
 
166
  .build()
 
167
 
168
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
169
  await update.message.reply_text(
170
- "Hello! Ask me anything about roles, timings, addresses, and logistics.
171
- "
172
- "Admins can use /refresh after updating docs.")
173
 
174
  async def help_cmd(update: Update, context: ContextTypes.DEFAULT_TYPE):
175
  await update.message.reply_text("Use /refresh or just ask your question in plain text.")
@@ -213,4 +220,3 @@ async def telegram_webhook(token: str, request: Request):
213
  update = Update.de_json(data, (await start_telegram()).bot)
214
  await (await start_telegram()).process_update(update)
215
  return JSONResponse({"ok": True})
216
-
 
1
+ import os
2
+ import glob
3
+ import hashlib
4
  import numpy as np
5
  import pandas as pd
6
  from typing import List, Tuple
 
14
  from telegram import Update
15
  from telegram.ext import Application, CommandHandler, MessageHandler, ContextTypes, AIORateLimiter, filters
16
 
17
+ # ---------- Load environment ----------
18
  load_dotenv()
19
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
20
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")
21
+ PUBLIC_URL = os.getenv("PUBLIC_URL", "") # e.g., https://username-space.hf.space
22
  OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
23
  EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
24
+ STRICT_DOC_MODE = os.getenv("STRICT_DOC_MODE", "true").lower() == "true"
25
  DOCS_DIR = os.getenv("DOCS_DIR", "wedding_docs")
26
  INDEX_PATH = os.getenv("INDEX_PATH", "wedding.index")
27
  META_CSV = os.getenv("META_CSV", "wedding_chunks.csv")
 
29
  client = OpenAI(api_key=OPENAI_API_KEY)
30
 
31
  # ---------- Doc loaders ----------
 
32
  def read_txt_md(path: str) -> str:
33
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
34
+ return f.read()
35
 
36
  def read_docx(path: str) -> str:
37
  doc = DocxDocument(path)
38
+ return "\n".join(p.text for p in doc.paragraphs)
39
 
40
  def read_pdf(path: str) -> str:
41
  reader = PdfReader(path)
42
+ return "\n".join((p.extract_text() or "") for p in reader.pages)
43
 
44
  def load_all_docs(folder: str) -> List[Tuple[str, str]]:
45
  paths = []
 
60
  return docs
61
 
62
  # ---------- Index ----------
63
+ def chunk_text(text: str, source: str, chunk_size: int = 350, overlap: int = 50):
 
64
  words = text.split()
65
  i = 0
66
  while i < len(words):
67
+ chunk = " ".join(words[i:i + chunk_size])
68
+ yield {"source": source, "chunk": chunk, "hash": hashlib.md5((source + str(i)).encode()).hexdigest()}
69
  i += (chunk_size - overlap)
70
 
71
  def embed_texts(texts: list[str]) -> np.ndarray:
 
85
  raise RuntimeError(f"No docs in {DOCS_DIR}/")
86
  index_exists = os.path.exists(INDEX_PATH) and os.path.exists(META_CSV)
87
  need = force or not index_exists
88
+
89
  if index_exists and not need:
90
  df = pd.read_csv(META_CSV)
91
  vecs = np.load(INDEX_PATH)
 
94
  idx.add(vecs)
95
  self.index, self.df, self.dim = idx, df, vecs.shape[1]
96
  return
97
+
98
+ # build new index
99
  chunks = []
100
  for p, t in docs:
101
  for c in chunk_text(t, p):
 
109
  idx.add(vecs)
110
  self.index, self.df, self.dim = idx, df, vecs.shape[1]
111
 
112
+ def retrieve(self, q: str, k=6):
113
  qv = embed_texts([q])
114
  faiss.normalize_L2(qv)
115
  D, I = self.index.search(qv, k)
 
124
 
125
  SYSTEM_PROMPT = (
126
  "You are a concise wedding assistant for Samson’s brother’s wedding. "
127
+ "Use ONLY the provided context. If missing, say so and suggest contacting Overall IC. "
128
+ "Keep answers under 150 words."
129
  )
130
 
131
  async def answer_with_rag(q: str) -> str:
132
+ ctx = RAG.retrieve(q, k=6)
133
  blocks = []
134
  for r in ctx:
135
  t = r["chunk"]
136
  if len(t) > 800:
137
  t = t[:800] + "…"
138
+ blocks.append(f"[Source: {os.path.basename(r['source'])}]\n{t}")
 
 
139
 
140
+ context_text = "\n\n".join(blocks)
141
  completion = client.chat.completions.create(
142
  model=OPENAI_MODEL,
143
  messages=[
144
  {"role": "system", "content": SYSTEM_PROMPT},
145
+ {"role": "user", "content": f"Context from docs:\n\n{context_text}\n\nQuestion: {q}"}
146
  ],
147
  temperature=0.2,
148
  )
149
  a = completion.choices[0].message.content.strip()
 
150
  if STRICT_DOC_MODE and not blocks:
151
  return (
152
  "I couldn’t find this in the docs. Please check the playbook or ask the Overall IC. "
 
162
  global telegram_app
163
  if telegram_app is not None:
164
  return telegram_app
165
+
166
  RAG.load_or_build(force=False)
167
+
168
+ application = (
169
+ Application.builder()
170
+ .token(TELEGRAM_BOT_TOKEN)
171
+ .rate_limiter(AIORateLimiter())
172
  .build()
173
+ )
174
 
175
  async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
176
  await update.message.reply_text(
177
+ "Hello! Ask me anything about roles, timings, addresses, and logistics.\n"
178
+ "Admins can use /refresh after updating docs."
179
+ )
180
 
181
  async def help_cmd(update: Update, context: ContextTypes.DEFAULT_TYPE):
182
  await update.message.reply_text("Use /refresh or just ask your question in plain text.")
 
220
  update = Update.de_json(data, (await start_telegram()).bot)
221
  await (await start_telegram()).process_update(update)
222
  return JSONResponse({"ok": True})