naveen07garg committed on
Commit
8401c82
·
verified ·
1 Parent(s): 5fabc0f

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +637 -0
app.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import spacy
3
+ import re
4
+ import os, requests, time
5
+ import fitz # PyMuPDF We use PyMuPDF (fitz) to capture hierarchy (section → subsection → subsubsection → content/bullets).
6
+ from collections import Counter
7
+ from fastapi import FastAPI
8
+ from pydantic import BaseModel
9
+ from typing import Optional
10
+
11
+ from langchain_community.vectorstores import Chroma
12
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
13
+
14
+ # --------------------------
15
+ # HR Assistant Prompt Templates
16
+ # --------------------------
17
+
18
# System prompt for the HR assistant. Defines response rules, citation format,
# and the exact fallback sentence for out-of-scope questions.
# Fixes: "generic resonse" -> "generic response", "post fix" -> "postfix", and
# "Flyline" -> "Flykite" (the fallback sentence must name the airline correctly,
# since the model is told to emit it verbatim).
hr_system_message = """
You are the Flykite Airlines HR Policy Assistant.

Your role is to answer employee questions based on official HR documents (handbooks, policy PDFs, etc.).
Each user question will start with the token: ###Question.

### Response Rules
- Be clear, factual, and professional.
- Use bullet points (-) or numbered lists (1., 2., etc.) for clarity.
- Begin with a **one-line summary**, then details.
- Cite the specific policy references (Document → Section → Subsection → Sub-subsection) where
  the answer comes from.
- If the answer is not in the source, reply with 1 line of generic response and postfix with exactly: \n\n **"Could not find anything out from Flykite HR documentation around your query.\n\nPlease rephrase your query."**
- Do **not** make assumptions or fabricate information.

### Ambiguity & Context
- If a query could refer to multiple policies or depends on role/location/department, ask **one short clarifying question**.
- If you assume a context, state it clearly (e.g., "Assuming HQ staff...").
- When policies differ by role/location, list variations clearly.

### Personalization
- Tailor responses to any role, location, or employment type provided.
- Mention if rules vary and what those differences are.

### Format
1. One-line summary.
2. Key details, steps, or rules.
3. Specific policy references (Document → Section → Subsection → Sub-subsection) where
   the answer comes from.
4. Optional follow-up suggestion or clarifying question.

### Important
- Never guess or invent policy content.
- Maintain confidentiality and avoid personal data.
- User questions always begin with `###Question`. Respond only to those.
"""
54
+
55
# User-turn template: injects the retrieved context and the raw question using
# the ###Context / ###Question tokens that the system prompt above expects.
# Filled via .format(context=..., question=...) in generate_rag_response.
hr_user_message_template = """
Consider the following ###Context and ###Question:

###Context
{context}

###Question
{question}
"""
64
+
65
+ # --------------------------
66
+ # PDF Parsing Utils
67
+ # --------------------------
68
+
69
def clean_text_hidden(s: str) -> str:
    """Replace zero-width/bidi/nbsp/soft-hyphen characters with spaces and
    collapse all runs of whitespace into single spaces, trimming the ends."""
    if not s:
        return ""
    without_hidden = re.sub(r"[\u200B-\u200F\u202A-\u202E\u00A0\u00AD]", " ", s)
    collapsed = re.sub(r"\s+", " ", without_hidden)
    return collapsed.strip()
75
+
76
def is_line_fully_bold(spans):
    """True when every span with visible text is bold — either "Bold" appears
    in the font name or flag bit 0x2 is set (the check used throughout this
    file). A line with no visible text vacuously counts as bold."""
    for span in spans:
        if not span.get("text", "").strip():
            continue  # ignore whitespace-only spans
        if "Bold" not in span["font"] and span["flags"] & 2 == 0:
            return False
    return True
81
+
82
def detect_font_levels(pdf_path):
    """Infer heading font sizes from the PDF's text spans.

    Collects every span size (rounded to 0.1), then picks:
      - section_size: largest "candidate" size after trimming the single
        largest and smallest sizes when more than three distinct sizes exist
        (the extremes are typically the title and footnotes),
      - subsubsection_size: the next candidate, or section_size if only one.

    Returns:
        (section_size, subsubsection_size) tuple of floats.

    Raises:
        ValueError: if the PDF contains no text spans (the original code
        crashed here with an opaque IndexError).
    """
    doc = fitz.open(pdf_path)
    try:
        font_sizes = [
            round(span["size"], 1)
            for page in doc
            for block in page.get_text("dict")["blocks"]
            for line in block.get("lines", [])
            for span in line.get("spans", [])
        ]
    finally:
        doc.close()  # bug fix: document handle was never closed

    unique_sizes = sorted(set(font_sizes), reverse=True)
    if not unique_sizes:
        raise ValueError(
            f"No text spans found in {pdf_path}; cannot detect heading font sizes"
        )

    # Trim the extremes only when enough distinct sizes remain.
    candidate_sizes = unique_sizes[1:-1] if len(unique_sizes) > 3 else unique_sizes
    section_size = candidate_sizes[0] if candidate_sizes else unique_sizes[0]
    subsubsection_size = candidate_sizes[1] if len(candidate_sizes) > 1 else section_size
    return section_size, subsubsection_size
99
+
100
def most_common_size(sizes):
    """Return the most frequent size in the list, or None if it is empty."""
    if not sizes:
        return None
    (size, _count), = Counter(sizes).most_common(1)
    return size
102
+
103
def parse_flykite(pdf_path):
    """Parse the Flykite HR handbook PDF into a nested structure:
    section → subsection → subsubsection → content lines.

    Heading detection is font-driven: lines whose dominant span size matches
    `section_size` become sections (or, when fully bold and containing the
    word "policy", subsections); numbered lines ("1. ...") at
    `subsubsection_size` that are bold become sub-subsections.  Everything
    else is attached as content to the innermost open node.

    Returns a list of section dicts.
    """
    section_size, subsubsection_size = detect_font_levels(pdf_path)
    doc = fitz.open(pdf_path)
    sections = []
    # Running parser state: the innermost open node at each level.
    current_section, current_subsection, current_subsubsection = None, None, None

    for page_num, page in enumerate(doc, start=1):  # page_num kept for debugging
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            for l in b.get("lines", []):
                spans = l.get("spans", [])
                line_text = "".join(s.get("text", "") for s in spans).strip()
                line_text = clean_text_hidden(line_text)
                if not line_text:
                    continue
                # The line's "size" is the most common span size on it.
                span_sizes = [round(s["size"], 1) for s in spans]
                line_size = most_common_size(span_sizes)

                # SECTION/SUBSECTION: both share the section font size;
                # bold + "policy" in the text marks a subsection.
                if line_size == section_size:
                    if is_line_fully_bold(spans) and "policy" in line_text.lower():
                        current_subsection = {"subsection": line_text, "subsubsections": [], "content": []}
                        if current_section:
                            current_section["subsections"].append(current_subsection)
                        # NOTE(review): a subsection seen before any section is
                        # silently dropped — confirm this cannot occur in the data.
                    else:
                        current_section = {"section": line_text, "subsections": []}
                        sections.append(current_section)
                        current_subsection = None
                        current_subsubsection = None
                    continue

                # SUB-SUBSECTION: a numbered line ("1. Title") at the
                # sub-subsection font size, either fully bold or with a plain
                # number span followed by bold title spans.
                if re.match(r"^\d+\s*\.\s+", line_text):
                    if line_size == subsubsection_size:
                        is_heading = False
                        if is_line_fully_bold(spans):
                            is_heading = True
                        else:
                            if len(spans) > 1:
                                first_span_text = clean_text_hidden(spans[0]["text"]).strip()
                                if re.match(r"^\d+\.?$", first_span_text):
                                    # Number span may be non-bold; the rest must be bold.
                                    rest_bold = all(
                                        ("Bold" in s["font"] or s["flags"] & 2 != 0)
                                        for s in spans[1:] if s.get("text", "").strip()
                                    )
                                    if rest_bold:
                                        is_heading = True
                        if is_heading:
                            current_subsubsection = {"title": line_text, "content": []}
                            if current_subsection:
                                current_subsection["subsubsections"].append(current_subsubsection)
                            elif current_section:
                                # Synthesize a subsection named after the section so
                                # the hierarchy stays three levels deep.
                                auto_sub = {"subsection": current_section["section"], "subsubsections": []}
                                current_section["subsections"].append(auto_sub)
                                current_subsection = auto_sub
                                current_subsection["subsubsections"].append(current_subsubsection)
                            continue

                # Otherwise treat as content: attach to the innermost open node,
                # falling back to an "intro" bucket before any section appears.
                if current_subsubsection:
                    current_subsubsection["content"].append(line_text)
                elif current_subsection:
                    current_subsection["content"].append(line_text)
                elif current_section:
                    current_section.setdefault("content", []).append(line_text)
                else:
                    if not sections:
                        sections.append({"intro": [line_text]})
                    else:
                        sections[0].setdefault("intro", []).append(line_text)
    return sections
173
+
174
+
175
+
176
+ # (REST calls, no LangChain-OpenAI).
177
+ class SimpleChat:
178
+ def __init__(self, model="gpt-4o-mini"):
179
+ self.model = model
180
+ self.api_key = os.getenv("OPENAI_API_KEY")
181
+ self.base_url = "https://api.openai.com/v1/chat/completions"
182
+
183
+ def invoke(self, messages, temperature=0, max_tokens=1500):
184
+ resp = requests.post(
185
+ self.base_url,
186
+ headers={"Authorization": f"Bearer {self.api_key}"},
187
+ json={
188
+ "model": self.model,
189
+ "messages": messages,
190
+ "temperature": temperature,
191
+ "max_tokens": max_tokens
192
+ }
193
+ )
194
+ resp.raise_for_status()
195
+ return resp.json()["choices"][0]["message"]["content"].strip()
196
+
197
+
198
+
199
# --------------------------
# Chunking + RAG
# --------------------------

# Each chunk carries its parent section and subsection titles alongside the
# sub-subsection title, so a query phrased at section level can still match,
# and so responses can cite sources precisely
# (document name, section, subsection, sub-subsection).

# Load the spaCy NER model used for role/location/department extraction below.
nlp = spacy.load("en_core_web_sm")
214
+
215
+ # --- spaCy Extraction ---
216
def extract_with_spacy(text):
    """NER pass over `text`: GPE/LOC entities become locations, ORG entities
    become departments, and PERSON entities are kept as roles (job titles
    sometimes get tagged as PERSON). Each list is de-duplicated."""
    parsed = nlp(text)
    buckets = {"roles": [], "locations": [], "departments": []}
    label_to_bucket = {
        "GPE": "locations",
        "LOC": "locations",
        "ORG": "departments",
        "PERSON": "roles",
    }
    for ent in parsed.ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:
            buckets[bucket].append(ent.text)
    return {name: list(set(values)) for name, values in buckets.items()}
233
+
234
+ # --- LLM Extraction ---
235
def extract_with_llm(text):
    """Ask the LLM to extract roles/locations/departments from `text`.

    Returns a dict with "roles", "locations", "departments" lists; on any
    failure (network error, unparseable output) returns empty lists.

    Fixes vs. original:
    - dropped `os.environ["OPENAI_API_KEY"] = os.getenv(...)`, which was a
      no-op when the key existed and raised TypeError when it did not;
    - strips markdown ``` fences before JSON parsing (models frequently wrap
      JSON output in them, which previously forced the empty fallback);
    - logs the actual exception instead of only a banner.
    """
    prompt = f"""
You are an expert HR assistant for an airline company.

Your Task:
- Extract **Role(s)**, **Location(s)**, and **Department(s)** explicitly or implicitly mentioned
  in the following HR policy text.
- Focus on aviation-related roles (e.g., Pilot, Cabin Crew, Engineer, Ground Staff, Field Staff),
  locations (e.g., India, UK, Singapore, Headquarters), and departments (e.g., HR, Finance, Compliance, Operations).
- If something is implied (e.g., "field staff" → role=Field Staff, location unspecified), capture it.
- If no information is found, return an empty list for that field.

---
### FEW SHOTS Examples
Text: "Special leave for cabin crew in Singapore"
Output: {{"roles": ["Cabin Crew"], "locations": ["Singapore"], "departments": []}}

Text: "Pilots based in UK headquarters"
Output: {{"roles": ["Pilot"], "locations": ["United Kingdom", "Headquarters"], "departments": []}}

Text: "HR staff policies in India"
Output: {{"roles": [], "locations": ["India"], "departments": ["HR"]}}

Text: "Field staff in Dubai get separate insurance policy"
Output: {{"roles": ["Field Staff"], "locations": ["Dubai"], "departments": []}}

---
Now extract from:
{text}

Output:
Return only valid JSON in this exact schema:
{{
  "roles": [list of roles],
  "locations": [list of locations],
  "departments": [list of departments]
}}
"""

    empty = {"roles": [], "locations": [], "departments": []}
    try:
        # Plain REST client (no LangChain-OpenAI) — see SimpleChat.
        llm = SimpleChat(model="gpt-4o-mini")
        messages = [{"role": "user", "content": prompt}]
        content = llm.invoke(messages, temperature=0, max_tokens=1500).strip()

        # Strip optional markdown fences like ```json ... ```.
        if content.startswith("```"):
            content = content.strip("`").strip()
            if content.lower().startswith("json"):
                content = content[4:].strip()

        # Enforce safe parsing: only parse what looks like a JSON object.
        if content.startswith("{"):
            extracted = json.loads(content)
        else:
            extracted = empty
    except Exception as exc:
        print("NOT ABLE TO RESOLVE LLM CALL XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", exc)
        extracted = empty

    return extracted
297
+
298
+
299
+ # --- Merge spaCy + LLM ---
300
def enrich_metadata(text):
    """Union of the spaCy and LLM extraction results for one piece of text."""
    ner_result = extract_with_spacy(text)
    llm_result = extract_with_llm(text)
    merged = {}
    for field in ("roles", "locations", "departments"):
        merged[field] = list(set(ner_result[field] + llm_result[field]))
    return merged
308
+
309
+ # --- Ensure metadata is Chroma-compatible ---
310
def sanitize_metadata(meta: dict) -> dict:
    """Coerce metadata values into Chroma-compatible scalars: scalars and None
    pass through, lists/tuples are comma-joined, dicts become JSON strings,
    and anything else falls back to str()."""
    cleaned = {}
    for key, value in meta.items():
        if value is None or isinstance(value, (str, int, float, bool)):
            cleaned[key] = value
        elif isinstance(value, (list, tuple)):
            cleaned[key] = ", ".join(map(str, value))
        elif isinstance(value, dict):
            cleaned[key] = json.dumps(value, ensure_ascii=False)
        else:
            cleaned[key] = str(value)
    return cleaned
322
+
323
+
324
+
325
+ # --- Flatten JSON to chunks ---
326
+ def flatten_json_to_chunks(structured_json, document_name="Flykite HR Policy Handbook"):
327
+ chunks = []
328
+ for sec in structured_json:
329
+ section_title = sec.get("section")
330
+ for sub in sec.get("subsections", []):
331
+ subsection_title = sub.get("subsection")
332
+
333
+ # Sub-subsections
334
+ for subsub in sub.get("subsubsections", []):
335
+ content_text = " ".join(subsub.get("content", []))
336
+ if content_text.strip():
337
+ enriched_meta = enrich_metadata(content_text)
338
+ meta = sanitize_metadata({
339
+ "document": document_name,
340
+ "section": section_title,
341
+ "subsection": subsection_title,
342
+ "subsubsection": subsub.get("title"),
343
+ **enriched_meta
344
+ })
345
+ chunks.append({
346
+ "text": f"{section_title} | {subsection_title} | {subsub.get('title')}\n\n{content_text}",
347
+ "metadata": meta
348
+ })
349
+
350
+ # Fallback: orphaned content under subsection
351
+ if sub.get("content"):
352
+ content_text = " ".join(sub.get("content", []))
353
+ enriched_meta = enrich_metadata(content_text)
354
+ meta = sanitize_metadata({
355
+ "document": document_name,
356
+ "section": section_title,
357
+ "subsection": subsection_title,
358
+ "subsubsection": "", # None, : Chroma doesn’t allow None values. They must be strings (or removed),
359
+ **enriched_meta
360
+ })
361
+ chunks.append({
362
+ "text": f"{section_title} | {subsection_title}\n\n{content_text}",
363
+ "metadata": meta
364
+ })
365
+
366
+ # Fallback: orphaned content under section
367
+ if sec.get("content"):
368
+ content_text = " ".join(sec.get("content", []))
369
+ enriched_meta = enrich_metadata(content_text)
370
+ meta = sanitize_metadata({
371
+ "document": document_name,
372
+ "section": section_title,
373
+ "subsection": "", # None, : Chroma doesn’t allow None values. They must be strings (or removed),
374
+ "subsubsection": "", # None, : Chroma doesn’t allow None values. They must be strings (or removed),
375
+ **enriched_meta
376
+ })
377
+ chunks.append({
378
+ "text": f"{section_title}\n\n{content_text}",
379
+ "metadata": meta
380
+ })
381
+ return chunks
382
+
383
+
384
+
385
+
386
def build_context(docs):
    """Join retrieved documents into one prompt context string, each prefixed
    with its citation trail (document → section / subsection / subsubsection,
    skipping empty levels) and separated by horizontal rules."""
    rendered = []
    for doc in docs:
        meta = doc.metadata
        trail = f"{meta.get('document')} → {meta.get('section')}"
        for level in ("subsection", "subsubsection"):
            if meta.get(level):
                trail += f" / {meta.get(level)}"
        rendered.append(f"Source: {trail}\n{doc.page_content}")
    return "\n\n---\n\n".join(rendered)
397
+
398
+
399
+
400
+ # -----------------------
401
+ # User Query Enrichment
402
+ # -----------------------
403
def extract_metadata_from_query(query: str):
    """Use spaCy + LLM to extract role/location/department from user query."""
    spacy_res = extract_with_spacy(query)
    print("spaCy results ## ==>", spacy_res)
    llm_res = extract_with_llm(query)
    print("LLM Extraction Results ## ==>", llm_res)

    combined = {}
    for field in ("roles", "locations", "departments"):
        combined[field] = list(set(spacy_res[field] + llm_res[field]))
    return combined
415
+
416
+
417
+ # -----------------------
418
+ # Helper: Filter docs manually
419
+ # -----------------------
420
def filter_docs_by_metadata(docs, metadata_filters):
    """Keep only docs whose metadata matches every non-empty filter category.

    NOTE(review): sanitize_metadata stores these fields as comma-joined
    strings, so `value in meta.get(field, [])` is a substring test against
    that string (e.g. "Pilot" in "Pilot, Cabin Crew") — confirm this
    substring matching is intended.
    """
    kept = []
    for doc in docs:
        meta = doc.metadata
        matches = True
        for field in ("roles", "locations", "departments"):
            wanted = metadata_filters.get(field)
            if wanted and not any(value in meta.get(field, []) for value in wanted):
                matches = False
                break
        if matches:
            kept.append(doc)
    return kept
434
+
435
+
436
+
437
def generate_rag_response(user_input, retriever, k=3, max_tokens=1500):
    """Answer an HR question via metadata-filtered RAG.

    Pipeline: extract role/location/department from the query (spaCy + LLM),
    retrieve top-k chunks semantically, narrow them by metadata match (falling
    back to the unfiltered set when nothing matches), build a cited context,
    and ask the chat model.

    Returns a dict: {"answer": str, "sources": [metadata dicts]}.
    """
    # Step 1: extract personalization metadata from the query.
    query_metadata = extract_metadata_from_query(user_input)

    print("\n======================")
    print(" User Query:", user_input)
    print(" Extracted metadata from query:", query_metadata)  # Investigatory log

    # Step 2: retrieve top-k docs semantically.
    # NOTE(review): whether get_relevant_documents honors a `k=` kwarg depends
    # on the LangChain version; the retriever was also built with k=3 in
    # search_kwargs — confirm which one wins.
    retrieved_docs = retriever.get_relevant_documents(user_input, k=k)
    print(f" Retrieved {len(retrieved_docs)} docs before filtering")

    # Step 3: apply metadata filtering, falling back to the unfiltered
    # semantic results so valid answers are never blocked.
    filtered_docs = filter_docs_by_metadata(retrieved_docs, query_metadata)
    if filtered_docs:
        selected_docs = filtered_docs
        print(f"✅ {len(selected_docs)} docs kept after metadata filtering")
    else:
        selected_docs = retrieved_docs  # fallback if no metadata match
        print("⚠️ No metadata match, falling back to semantic retrieval only")

    # Step 4: log the selected chunks for debugging.
    print(f"✅ Retrieved {len(selected_docs)} docs")
    for i, d in enumerate(selected_docs, 1):
        print(f"\n--- Chunk {i} ---")
        print("Text:", d.page_content[:200], "...")  # preview first 200 chars
        print("Metadata:", d.metadata)

    # Step 5: assemble the prompt from the cited context and templates.
    context_for_query = build_context(selected_docs)
    user_prompt = hr_user_message_template.format(context=context_for_query, question=user_input)

    messages = [
        {"role": "system", "content": hr_system_message},
        {"role": "user", "content": user_prompt},
    ]

    # Plain REST client (no LangChain-OpenAI) — the LangChain client version
    # in this container passes an unsupported `proxies` kwarg.
    # NOTE(review): this env reassignment is a no-op when the key exists and
    # raises TypeError when it is missing — confirm it can be removed.
    os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    llm = SimpleChat(model="gpt-4o-mini")
    answer = llm.invoke(messages, temperature=0, max_tokens=max_tokens)
    return {"answer": answer, "sources": [d.metadata for d in selected_docs]}
496
+
497
+
498
+
499
# --------------------------
# FastAPI App
# --------------------------

#--================== START of API setup on reboot =====================
app = FastAPI()
persist_dir = "./flykite_chromadb"  # on-disk Chroma persistence directory
retriever = None  # set below once the vectorstore has been built


class QueryRequest(BaseModel):
    """Request body for the (currently disabled) /query endpoint."""
    query: str
    top_k: Optional[int] = 3
512
# NOTE(review): the startup hook below is commented out, so everything from
# here down to the retriever assignment executes at module import time.
#@app.on_event("startup")
#def startup_event():
#global retriever
time.sleep(2)  # ✅ give Hugging Face time to inject secrets before reading them
print("🔑 OPENAI_API_KEY loaded:", bool(os.getenv("OPENAI_API_KEY")))
pdf_path = "data/Dataset-FlykiteAirlines_HRP.pdf"  # PDF shipped inside the repo

# Parse PDF → hierarchical JSON (section → subsection → sub-subsection)
parsed_data = parse_flykite(pdf_path)
print(json.dumps(parsed_data[:1], indent=2, ensure_ascii=False))

# Fail fast if parsing produced nothing.
if not parsed_data:
    raise RuntimeError(" Parsed JSON is empty, cannot build chunks/vectorstore")

# Flatten the hierarchy into embeddable chunks with citation metadata.
chunks = flatten_json_to_chunks(parsed_data)
print(f" Loaded {len(chunks)} chunks from JSON")

# If no chunks, fail early
if not chunks:
    raise RuntimeError("No chunks generated from structured JSON")


# Build Chroma vectorstore using the inline SimpleEmbeddings client below.
# NOTE(review): re-setting OPENAI_API_KEY from itself is a no-op when the key
# exists and raises TypeError when it is missing — confirm it can be removed.
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
538
class SimpleEmbeddings:
    """Minimal OpenAI /v1/embeddings REST client used instead of
    OpenAIEmbeddings (avoids the LangChain client `proxies` issue).
    Provides the embed_documents/embed_query interface Chroma expects."""

    def __init__(self, model="text-embedding-3-small", timeout=60):
        self.model = model
        self.api_key = os.getenv("OPENAI_API_KEY")
        self.base_url = "https://api.openai.com/v1/embeddings"
        # Bug fix: requests.post has no default timeout, so an unresponsive
        # API call could previously hang module import forever.
        self.timeout = timeout

    def embed_documents(self, texts):
        """Embed each text with one API call; returns a list of vectors.

        Raises requests.HTTPError on non-2xx responses.
        """
        embeddings = []
        for text in texts:
            resp = requests.post(
                self.base_url,
                headers={"Authorization": f"Bearer {self.api_key}"},
                json={"model": self.model, "input": text},
                timeout=self.timeout,
            )
            resp.raise_for_status()
            embeddings.append(resp.json()["data"][0]["embedding"])
        return embeddings

    def embed_query(self, query):
        """Embed a single query string; returns one vector."""
        resp = requests.post(
            self.base_url,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={"model": self.model, "input": query},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return resp.json()["data"][0]["embedding"]
564
+
565
+
566
# Use SimpleEmbeddings instead of OpenAIEmbeddings (plain REST, no LangChain client).
embedding = SimpleEmbeddings(model="text-embedding-3-small")

texts = [c["text"] for c in chunks]
metadatas = [c["metadata"] for c in chunks]

# Embed every chunk and persist the vectorstore to disk under persist_dir.
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embedding,
    metadatas=metadatas,
    persist_directory=persist_dir,
    ids=[f"chunk_{i}" for i in range(len(chunks))]
)

vectorstore.persist()  # ensure data is saved to disk

print("💾 Chroma vectorstore saved !!")

# Module-level retriever used by chat_fn; fixed top-3 semantic search.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
print(" PDF parsed, chunks embedded, retriever initialized.")

#--================== END of API setup on start =====================
# NOTE(review): the /query endpoint below is disabled; /chat is the live one.
#@app.post("/query")
#def query_endpoint(req: QueryRequest):
#    return generate_rag_response(req.query, retriever, k=req.top_k)
592
+
593
+
594
+
595
+
596
def wait_for_key(key_name="OPENAI_API_KEY", timeout=10):
    """Poll the environment once per second for up to `timeout` seconds until
    `key_name` appears; returns True if found, False otherwise."""
    attempts = 0
    while attempts < timeout:
        if os.getenv(key_name):
            print(f"✅ {key_name} available.")
            return True
        print(f"⏳ Waiting for {key_name}...")
        time.sleep(1)
        attempts += 1
    print(f"❌ {key_name} not found after {timeout} seconds.")
    return False
605
+
606
+ # =============================
607
+ # Step 5: Chat Function
608
+ # =============================
609
+
610
def format_answer(result):
    """Render a RAG result as display text: the answer followed by a bulleted
    source-citation list (document → section / subsection / subsubsection).

    Robustness fix: citation fields are read with .get(..., "") so a source
    dict missing a key no longer raises KeyError (output is unchanged for
    complete dicts).
    """
    answer = result["answer"]
    sources = result.get("sources", [])

    formatted_sources = "\n".join(
        f"- {s.get('document', '')} → {s.get('section', '')} / "
        f"{s.get('subsection', '')} / {s.get('subsubsection', '')}"
        for s in sources
    )

    return f"""{answer}

📄 **Sources**
{formatted_sources}
"""
624
+
625
def chat_fn(message, history):
    """Chat handler: run the RAG pipeline for one user message.

    `history` is accepted for chat-UI compatibility but is not used here.
    Returns formatted answer text, or a warning string if the module-level
    retriever was never initialized.
    """
    global retriever
    wait_for_key()  # block briefly until OPENAI_API_KEY is injected
    if retriever is None:
        return "⚠️ Retriever not initialized. Please rebuild or check vector DB."
    answer = generate_rag_response(message, retriever)
    return format_answer(answer)
632
+
633
class ChatRequest(BaseModel):
    """Request body for /chat. `history` is accepted for UI compatibility but
    is not used by the retrieval pipeline."""
    message: str
    history: Optional[list] = None


@app.post("/chat")
def chat_api(req: ChatRequest):
    """Chat endpoint: runs the RAG pipeline over the HR handbook.

    Bug fix: ChatRequest was referenced here but never defined anywhere in
    the file, so FastAPI raised NameError at import time when resolving the
    annotation; the model is now declared above.
    """
    response = chat_fn(req.message, req.history)
    return {"response": response}
637
+