selva1909 committed on
Commit
5a71086
·
verified ·
1 Parent(s): 308f1a3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -0
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import json
4
+ import hashlib
5
+ import shutil
6
+ from io import BytesIO
7
+ from typing import List, Tuple
8
+
9
+ import gradio as gr
10
+ import numpy as np
11
+ import faiss
12
+ import requests
13
+ from sentence_transformers import SentenceTransformer
14
+ import fitz # PyMuPDF
15
+
16
+ # ---------------- Config ----------------
17
# Credentials are read from the environment so no secret lives in the repo.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
# Chat model requested from OpenRouter and the local sentence-embedding model.
OPENROUTER_MODEL = "nvidia/nemotron-nano-12b-v2-vl:free"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Directory holding the on-disk embedding cache (one .npz file per upload set).
CACHE_DIR = "./cache"
SYSTEM_PROMPT = "You are a helpful assistant."

# Make sure the cache directory exists before anything tries to write to it.
os.makedirs(CACHE_DIR, exist_ok=True)

# The embedding model is loaded once at import time and shared by all requests.
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Process-wide index state, overwritten by upload_and_index()/build_faiss():
#   DOCS / FILENAMES  - parallel lists: extracted text and its source file name
#   EMBEDDINGS        - one embedding row per entry in DOCS (None until indexed)
#   FAISS_INDEX       - FAISS index built over EMBEDDINGS (None until indexed)
#   CURRENT_CACHE_KEY - cache key of the most recently uploaded file set
DOCS: List[str] = []
FILENAMES: List[str] = []
EMBEDDINGS: np.ndarray = None
FAISS_INDEX = None
CURRENT_CACHE_KEY: str = ""
32
+
33
+
34
+ # ---------------- Periodic cache cleanup ----------------
35
async def clear_cache_every_5min():
    """Wipe and recreate ``CACHE_DIR`` every 300 seconds, forever.

    Errors raised while clearing are printed and do not stop the loop.
    """
    interval_seconds = 300
    while True:
        await asyncio.sleep(interval_seconds)
        try:
            cache_present = os.path.exists(CACHE_DIR)
            if cache_present:
                shutil.rmtree(CACHE_DIR)
            # Recreate the directory so later cache writes have a target.
            os.makedirs(CACHE_DIR, exist_ok=True)
            print("🧹 Cache cleared.")
        except Exception as err:
            print(f"[Cache cleanup error] {err}")
45
+
46
def _start_cache_cleanup_thread():
    """Run ``clear_cache_every_5min`` on its own event loop in a daemon thread.

    Scheduling the coroutine with ``asyncio.get_event_loop().create_task`` at
    import time only queues it on a loop that is not running, so it would
    execute only if something later ran that particular loop. That call is
    also deprecated when no loop is running and is rejected by newer Python
    versions. A daemon thread with ``asyncio.run`` gives the coroutine a loop
    that actually runs and does not block interpreter shutdown.

    Returns:
        The started ``threading.Thread``.
    """
    import threading

    worker = threading.Thread(
        target=lambda: asyncio.run(clear_cache_every_5min()),
        name="cache-cleanup",
        daemon=True,
    )
    worker.start()
    return worker


_start_cache_cleanup_thread()
47
+
48
+
49
+ # ---------------- PDF extraction ----------------
50
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_bytes: Raw bytes of the PDF file.

    Returns:
        The concatenated page text, or a string starting with
        ``"[PDF extraction error]"`` if the document cannot be opened or read.
    """
    try:
        # The context manager closes the document (and frees its buffers)
        # even when text extraction fails part-way through.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        return f"[PDF extraction error] {e}"
56
+
57
+
58
+ # ---------------- Cache + FAISS helpers ----------------
59
def make_cache_key(files: List[Tuple[str, bytes]]) -> str:
    """Derive a SHA-256 hex key identifying a set of ``(name, bytes)`` files.

    Files are visited in name order, so the key does not depend on the order
    in which differently-named files are supplied. Each file contributes its
    name, its byte length and the SHA-256 digest of its content.
    """
    hasher = hashlib.sha256()
    for filename, payload in sorted(files, key=lambda entry: entry[0]):
        fingerprint = b"".join(
            (
                filename.encode(),
                str(len(payload)).encode(),
                hashlib.sha256(payload).digest(),
            )
        )
        hasher.update(fingerprint)
    return hasher.hexdigest()
66
+
67
def cache_save(cache_key: str, embeddings: np.ndarray, filenames: List[str]):
    """Write embeddings and their file names to ``<CACHE_DIR>/<cache_key>.npz``.

    The archive is compressed and holds two arrays, ``embeddings`` and
    ``filenames``.
    """
    target = os.path.join(CACHE_DIR, f"{cache_key}.npz")
    name_array = np.array(filenames)
    np.savez_compressed(target, embeddings=embeddings, filenames=name_array)
70
+
71
def cache_load(cache_key: str):
    """Load cached embeddings and file names for ``cache_key``.

    Args:
        cache_key: Key identifying ``<CACHE_DIR>/<cache_key>.npz``.

    Returns:
        ``(embeddings, filenames)`` where ``filenames`` is a plain list, or
        ``None`` when the entry does not exist or cannot be read.
    """
    path = os.path.join(CACHE_DIR, f"{cache_key}.npz")
    if not os.path.exists(path):
        return None
    try:
        # The archive only holds numeric/string arrays, so pickle support is
        # not needed; leaving it off avoids unpickling a tampered cache file.
        # The context manager closes the underlying zip file handle.
        with np.load(path, allow_pickle=False) as data:
            return data["embeddings"], data["filenames"].tolist()
    except Exception as exc:
        # A corrupt or concurrently deleted entry is treated as a cache miss.
        print(f"[Cache load error] {exc}")
        return None
79
+
80
def build_faiss(emb: np.ndarray):
    """Build an L2 FAISS index over ``emb`` and store it in ``FAISS_INDEX``.

    When ``emb`` is ``None`` or has no rows, ``FAISS_INDEX`` is reset to
    ``None`` and ``None`` is returned; otherwise the new index is returned.
    """
    global FAISS_INDEX

    has_vectors = emb is not None and len(emb) > 0
    if not has_vectors:
        FAISS_INDEX = None
        return None

    vectors = emb.astype("float32")
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    # Publish the index only after it has been fully populated.
    FAISS_INDEX = flat_index
    return flat_index
90
+
91
def search(query: str, k: int = 3):
    """Return up to ``k`` indexed documents nearest to ``query``.

    Each hit is a dict with keys ``index``, ``distance``, ``text`` and
    ``source``. Returns an empty list when no index has been built.
    """
    if FAISS_INDEX is None:
        return []

    query_vec = embedder.encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = FAISS_INDEX.search(query_vec, k)

    hits = []
    for dist, idx in zip(distances[0], indices[0]):
        # Negative indices are skipped, as in the original filter.
        if idx < 0:
            continue
        hits.append(
            {
                "index": int(idx),
                "distance": float(dist),
                "text": DOCS[idx],
                "source": FILENAMES[idx],
            }
        )
    return hits
100
+
101
+
102
+ # ---------------- OpenRouter API ----------------
103
def call_openrouter(prompt: str):
    """Send ``prompt`` to the OpenRouter chat-completions API.

    Returns the reply text with surrounding whitespace and triple-backtick
    fences removed. Failures are reported as bracketed message strings
    rather than raised.
    """
    if not OPENROUTER_API_KEY:
        return "[OpenRouter error] Missing OPENROUTER_API_KEY."

    endpoint = "https://openrouter.ai/api/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    system_message = {
        "role": "system",
        "content": SYSTEM_PROMPT + " Always respond in plain text. Avoid markdown.",
    }
    user_message = {"role": "user", "content": prompt}
    body = {
        "model": OPENROUTER_MODEL,
        "messages": [system_message, user_message],
    }

    try:
        response = requests.post(endpoint, headers=request_headers, json=body, timeout=60)
        response.raise_for_status()
        obj = response.json()

        if "choices" not in obj or not obj["choices"]:
            return "[Unexpected OpenRouter response]"

        reply = obj["choices"][0]["message"]["content"]
        return reply.strip().replace("```", "")
    except Exception as err:
        return f"[OpenRouter request error] {err}"
133
+
134
+
135
+ # ---------------- PDF Upload & Index ----------------
136
def _read_upload(upload) -> Tuple[str, bytes]:
    """Return ``(basename, raw bytes)`` for one uploaded file.

    Accepts either a filesystem path (what newer Gradio releases pass for
    ``gr.File``) or an object exposing the temp-file path as ``.name``.
    """
    path = upload if isinstance(upload, (str, os.PathLike)) else upload.name
    with open(path, "rb") as handle:
        return os.path.basename(path), handle.read()


def upload_and_index(files):
    """Extract, embed and index the uploaded PDFs.

    Updates the module-level ``DOCS``, ``FILENAMES``, ``EMBEDDINGS`` and
    ``CURRENT_CACHE_KEY`` and rebuilds the FAISS index. Embeddings are reused
    from the on-disk cache when an entry for the same file set exists.

    Args:
        files: Uploaded files from the Gradio ``File`` component (paths or
            temp-file objects); may be ``None`` or empty.

    Returns:
        ``(status message, JSON preview)`` where the preview lists each
        upload's name and size in upload order.
    """
    global DOCS, FILENAMES, EMBEDDINGS, CURRENT_CACHE_KEY

    if not files:
        return "No PDF uploaded.", ""

    processed = [_read_upload(f) for f in files]
    preview = [{"name": n, "size": len(b)} for n, b in processed]

    # The cache key ignores upload order (it hashes files sorted by name), so
    # documents must be indexed in that same canonical order. Otherwise a
    # cache hit would pair cached vectors with texts in a different order.
    ordered = sorted(processed, key=lambda item: item[0])
    names = [n for n, _ in ordered]

    cache_key = make_cache_key(ordered)
    CURRENT_CACHE_KEY = cache_key

    docs = [extract_text_from_pdf(b) for _, b in ordered]

    cached = cache_load(cache_key)
    if cached is not None:
        cached_embeddings, cached_names = cached
        # Only trust an entry whose rows line up with the current documents
        # (guards against entries written in a different order).
        if list(cached_names) == names and len(cached_embeddings) == len(docs):
            EMBEDDINGS = np.array(cached_embeddings)
            FILENAMES = list(cached_names)
            DOCS = docs
            build_faiss(EMBEDDINGS)
            return f"Loaded cached embeddings ({len(FILENAMES)} PDFs).", json.dumps(preview)

    DOCS = docs
    FILENAMES = names

    EMBEDDINGS = embedder.encode(DOCS, convert_to_numpy=True).astype("float32")
    cache_save(cache_key, EMBEDDINGS, FILENAMES)
    build_faiss(EMBEDDINGS)

    return f"Uploaded + indexed {len(DOCS)} PDFs.", json.dumps(preview)
169
+
170
+
171
+ # ---------------- Question Answering ----------------
172
def ask(question: str):
    """Answer ``question`` from the indexed PDFs via the OpenRouter model.

    Returns a short notice instead of an answer when the question is empty,
    nothing has been indexed, or the search yields no hits.
    """
    if not question:
        return "Please enter a question."
    if not DOCS:
        return "No PDFs indexed."

    hits = search(question)
    if not hits:
        return "No relevant text found."

    # Each hit contributes its source name and at most 15000 characters.
    sections = [
        f"Source: {hit['source']}\n\n{hit['text'][:15000]}\n---\n"
        for hit in hits
    ]
    context = "\n".join(sections)

    prompt = f"Use this context to answer briefly:\n\n{context}\nQuestion: {question}\nAnswer:"
    answer_text = call_openrouter(prompt)
    return answer_text
190
+
191
+
192
+ # ---------------- Gradio UI ----------------
193
with gr.Blocks(title="PDF RAG Bot") as demo:
    gr.Markdown("# 📄 PDF-Only RAG Bot\nUpload PDFs → Ask Questions → AI Answers from PDF content.")

    # --- Upload section: choose PDFs, index them, show status + preview ---
    file_input = gr.File(
        label="Upload PDF files",
        file_count="multiple",
        file_types=[".pdf"],
    )
    upload_btn = gr.Button("Upload & Index")
    status = gr.Textbox(label="Status", interactive=False)
    preview = gr.Textbox(label="Upload preview (JSON)", interactive=False)

    upload_btn.click(
        upload_and_index,
        inputs=[file_input],
        outputs=[status, preview],
    )

    # --- Question section: free-text question in, model answer out ---
    gr.Markdown("### Ask a Question")
    q = gr.Textbox(label="Your question", lines=3)
    ask_btn = gr.Button("Ask PDF Bot")
    answer = gr.Textbox(label="Answer", lines=15)

    ask_btn.click(ask, inputs=[q], outputs=[answer])

# Serve on all interfaces at port 7860 when run as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)