notionhive-ai commited on
Commit
0b99897
·
verified ·
1 Parent(s): 401748f

Upload 5 files

Browse files
Files changed (5) hide show
  1. faq_routes.py +239 -0
  2. faq_services.py +70 -0
  3. ircc_updater.py +66 -0
  4. main.py +25 -0
  5. requirements.txt +13 -0
faq_routes.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#Basic Packages
import io
import pandas as pd
import uuid
import traceback
from collections import defaultdict
import time
import os
import json

#API Packages
from fastapi import APIRouter, UploadFile, File, HTTPException, Body, Path, Request

#FAQ CSV Validator Package
from pydantic import BaseModel

#Calling Functions from other py files
from faq_services import gemini_model, db, load_faqs, add_faq_to_csv, faq_path
from chatbot_prompt import generate_prompt
from ircc_updater import manual_ircc_update_with_result

router = APIRouter()

# Rate-limit configuration: each client IP gets QUESTION_LIMIT non-greeting
# questions before being referred to WhatsApp.
# (The original also assigned `user_question_count = defaultdict(int)` here;
# that was dead code — it is unconditionally replaced below by the dict
# returned from load_question_log().)
QUESTION_LIMIT = 3
WHATSAPP_LINK = "https://wa.me/1234567890"
GREETING_KEYWORDS = {"hi", "hello", "hey", "good morning", "good evening", "good afternoon", "greetings"}
QUESTION_LOG_FILE = "question_limit_log.json"
29
+
30
# Data validation classes (pydantic request-body schemas)

class QuestionRequest(BaseModel):
    """Body for POST /ask: the visitor's free-text question."""
    query: str


class FAQItem(BaseModel):
    """One FAQ pair; used by the add/delete endpoints."""
    question: str
    answer: str
37
+
38
def is_greeting(text: str) -> bool:
    """Return True when *text* looks like a greeting / small talk.

    Greeting messages are not charged against the per-IP question limit
    in ask_faq().
    """
    import re  # local import so this fix is self-contained

    lower = text.lower().strip()
    # The original used a plain substring test, so e.g. "which visa do I
    # need?" matched the keyword "hi".  Match single-word greetings on
    # word boundaries instead.
    words = set(re.findall(r"[a-z]+", lower))
    if any(greet in words for greet in GREETING_KEYWORDS if " " not in greet):
        return True
    # Multi-word greetings ("good morning") are matched as phrases.
    if any(greet in lower for greet in GREETING_KEYWORDS if " " in greet):
        return True
    # Very short messages are treated as small talk rather than questions
    # (kept from the original heuristic).
    return len(lower) <= 12
41
+
42
# Load or initialize question count data
def load_question_log():
    """Return the persisted per-IP question counts as ``{ip: count}``.

    Falls back to an empty dict when the log file is missing, unreadable,
    or contains invalid JSON — a corrupt log file previously crashed the
    module at import time.
    """
    if not os.path.exists(QUESTION_LOG_FILE):
        return {}
    try:
        with open(QUESTION_LOG_FILE, "r") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {}
48
+
49
def save_question_log(log_data):
    """Persist the per-IP question counts to disk as JSON (synchronous)."""
    with open(QUESTION_LOG_FILE, "w") as fh:
        fh.write(json.dumps(log_data))

# In-memory cache of per-IP question counts, loaded once at import time and
# written back through save_question_log() whenever a question is charged.
user_question_count = load_question_log()
55
+
56
# Chat endpoint API
@router.post("/ask")
async def ask_faq(request: QuestionRequest, http_request: Request):
    """Answer a visitor question via FAQ vector search + Gemini.

    Rate limiting: each client IP may ask QUESTION_LIMIT non-greeting
    questions; beyond that the caller is referred to WhatsApp.  Greetings
    are answered without being charged.
    """
    # http_request.client can be None (e.g. under some test clients / ASGI
    # servers) — fall back to a shared bucket rather than crashing.
    client = http_request.client
    ip = client.host if client else "unknown"
    query = request.query.strip()

    count = user_question_count.get(ip, 0)

    if count >= QUESTION_LIMIT:
        return {
            "message": f"You've reached the free question limit. Please contact us on WhatsApp: {WHATSAPP_LINK}"
        }

    if not is_greeting(query):
        # Charge this question, then still answer it below.  The original
        # refused the question that reached the limit even though it had
        # just been counted — an off-by-one that gave users only
        # QUESTION_LIMIT - 1 answered questions.
        user_question_count[ip] = count + 1
        save_question_log(user_question_count)

    # Run search & generate response
    results = db.similarity_search(query, k=3)
    context = "\n\n".join(doc.page_content for doc in results)
    prompt = generate_prompt(context, query)

    try:
        response = gemini_model.generate_content(prompt)
        return {"answer": response.text.strip()}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
88
+
89
# Add Single FAQ API
@router.post("/add_faq")
async def add_faq(faq: FAQItem):
    """Append one FAQ to the CSV (rejecting exact duplicates) and rebuild the vector DB.

    Raises 400 when the identical question/answer pair already exists,
    500 on any other failure.
    """
    try:
        df = pd.read_csv(faq_path, encoding="utf-8")
        if ((df["prompt"] == faq.question) & (df["response"] == faq.answer)).any():
            raise HTTPException(status_code=400, detail="FAQ already exists.")
        new_df = pd.DataFrame([{"id": str(uuid.uuid4()), "prompt": faq.question, "response": faq.answer}])
        updated_df = pd.concat([df, new_df], ignore_index=True)
        updated_df.to_csv(faq_path, index=False, encoding="utf-8")
        # NOTE(review): this rebinds only this module's `db`; other modules
        # holding `from faq_services import db` keep the old store.
        global db
        db = load_faqs()
        return {"message": "FAQ added successfully."}
    except HTTPException:
        # Keep the intended 400 — the original generic handler converted
        # it into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
104
+
105
# Upload CSV API
@router.post("/upload_faqs_csv")
async def upload_faqs_csv(file: UploadFile = File(...)):
    """Bulk-import FAQs from an uploaded CSV with 'question'/'answer' columns.

    Returns a status dict (this endpoint reports errors in the body rather
    than raising HTTP errors, matching the frontend contract).
    """
    # Case-insensitive extension check — the original rejected "FAQS.CSV".
    if not file.filename.lower().endswith(".csv"):
        return {
            "status": "error",
            "message": "Invalid file type",
            "error": "Only CSV files are supported."
        }

    try:
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if "question" not in df.columns or "answer" not in df.columns:
            return {
                "status": "error",
                "message": "Invalid CSV structure",
                "error": "CSV must contain 'question' and 'answer' columns."
            }

        for _, row in df.iterrows():
            # Skip missing cells: str(NaN) would otherwise store the
            # literal text "nan" as a question or answer.
            if pd.isna(row["question"]) or pd.isna(row["answer"]):
                continue
            question = str(row["question"]).strip()
            answer = str(row["answer"]).strip()
            if question and answer:
                # NOTE(review): add_faq_to_csv re-reads the CSV per row
                # (O(n^2) for large uploads) — acceptable for small files.
                add_faq_to_csv(question, answer)

        global db
        db = load_faqs()

        return {
            "status": "success",
            "message": "FAQs uploaded and added successfully."
        }

    except Exception as e:
        traceback.print_exc()
        return {
            "status": "error",
            "message": "Failed to process CSV",
            "error": str(e)
        }
147
+
148
# Delete Single FAQ API
@router.delete("/delete_faq")
async def delete_faq(faq: FAQItem = Body(...)):
    """Delete the FAQ matching the exact question/answer pair and rebuild the vector DB.

    Raises 404 when no matching row exists, 500 on any other failure.
    """
    try:
        df = pd.read_csv(faq_path, encoding="utf-8")
        filtered_df = df[~((df["prompt"] == faq.question) & (df["response"] == faq.answer))]
        if len(df) == len(filtered_df):
            raise HTTPException(status_code=404, detail="FAQ not found.")
        filtered_df.to_csv(faq_path, index=False, encoding="utf-8")
        global db
        db = load_faqs()
        return {"message": "FAQ deleted successfully."}
    except HTTPException:
        # Preserve the intended 404 — the original generic handler
        # converted it into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
162
+
163
@router.delete("/deleted/{faq_id}")
async def delete_faq_by_id(faq_id: str = Path(...)):
    """Delete one FAQ by its UUID and rebuild the vector DB.

    Raises 404 when the id is unknown, 500 when the CSV lacks an 'id'
    column or on any other failure.
    """
    try:
        df = pd.read_csv(faq_path, encoding="utf-8")
        if "id" not in df.columns:
            raise HTTPException(status_code=500, detail="CSV does not contain 'id' column.")

        # Compare as strings so the match cannot fail if pandas inferred a
        # non-string dtype for the id column.
        filtered_df = df[df["id"].astype(str) != faq_id]
        if len(filtered_df) == len(df):
            raise HTTPException(status_code=404, detail="FAQ with given ID not found.")

        filtered_df.to_csv(faq_path, index=False, encoding="utf-8")
        global db
        db = load_faqs()
        return {"message": f"FAQ with ID {faq_id} deleted successfully."}
    except HTTPException:
        # Preserve the intended 404/500 detail — the original generic
        # handler flattened everything into a plain 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
180
+
181
# Delete All FAQs API
@router.delete("/delete/destroyall")
async def delete_all_faqs():
    """Wipe every FAQ: overwrite the CSV with an empty table and rebuild the vector DB."""
    try:
        empty = pd.DataFrame(columns=["id", "prompt", "response"])
        empty.to_csv(faq_path, index=False, encoding="utf-8")
        global db
        db = load_faqs()
        return {"message": "All FAQs deleted successfully."}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
191
+
192
# Show All FAQs API
@router.get("/get_faqs")
async def get_faqs():
    """Return every FAQ as a list of {'id', 'question', 'answer'} records."""
    try:
        frame = pd.read_csv(faq_path, encoding="utf-8").astype(str)
        renamed = frame.rename(columns={"prompt": "question", "response": "answer"})
        return renamed.to_dict(orient="records")
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="FAQ CSV file not found.")
    except pd.errors.ParserError as e:
        raise HTTPException(status_code=500, detail=f"CSV Parsing Error: {str(e)}")
    except UnicodeDecodeError as e:
        raise HTTPException(status_code=500, detail=f"Encoding Error: {str(e)}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected Error: {str(e)}")
208
+
209
# Retrain DB
@router.post("/retrain")
async def retrain_db():
    """Rebuild the vector store from the current FAQ CSV."""
    global db
    try:
        db = load_faqs()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"message": "Chatbot retrained successfully."}
218
+
219
@router.get("/update_ircc_faqs")
async def update_ircc_faqs():
    """Manually trigger the IRCC scrape/update job and report what was added."""
    try:
        # A None result is treated the same as "nothing new was added".
        added = manual_ircc_update_with_result() or []
        return {
            "message": "IRCC FAQs updated manually.",
            "added_count": len(added),
            "entries": added,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
239
+
faq_services.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# faq_services.py
import os
import pandas as pd
import uuid
from dotenv import load_dotenv

# LangChain / Vector DB
from langchain.vectorstores import Milvus
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.docstore.document import Document
from langchain.document_loaders.csv_loader import CSVLoader

# Google Gemini
import google.generativeai as genai

# ---------------------- Environment Setup ----------------------

os.environ["HF_HOME"] = "/tmp/hf_cache"  # cache HF models in a writable dir
load_dotenv()

api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    # Surface the misconfiguration at startup instead of letting the first
    # Gemini call fail with an opaque authentication error.
    print("[faq_services] WARNING: GOOGLE_API_KEY is not set; Gemini calls will fail.")
genai.configure(api_key=api_key)

gemini_model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
    generation_config={"temperature": 0.5}  # moderately deterministic answers
)

# ---------------------- File & Model Config ----------------------

faq_path = "faqs.csv"
embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Zilliz (Milvus) Cloud Config
milvus_uri = os.getenv("ZILLIZ_URI")
milvus_token = os.getenv("ZILLIZ_TOKEN")
collection_name = os.getenv("ZILLIZ_COLLECTION", "visaverse_faqs")
38
+
39
# ---------------------- Load FAQ Vector DB ----------------------

def load_faqs():
    """(Re)build the Milvus vector store from the FAQ CSV and return it.

    Creates an empty CSV on first run so the loader never crashes, and
    inserts a placeholder document when the CSV has no rows (the store
    cannot be built from zero documents).

    NOTE(review): Milvus.from_documents inserts into the existing cloud
    collection; repeated rebuilds may accumulate duplicate vectors unless
    the collection is dropped/recreated — verify against the deployment.
    """
    if not os.path.exists(faq_path):
        empty = pd.DataFrame(columns=["id", "prompt", "response"])
        empty.to_csv(faq_path, index=False, encoding="utf-8")

    documents = CSVLoader(faq_path, encoding="utf-8").load()
    if not documents:
        documents = [Document(page_content="This is a placeholder FAQ")]

    connection = {"uri": milvus_uri, "token": milvus_token}
    return Milvus.from_documents(
        documents=documents,
        embedding=embedding_model,
        connection_args=connection,
        collection_name=collection_name,
    )
60
+
61
# Shared vector-store handle, built once at import time (rebuilt by callers
# via load_faqs()).
db = load_faqs()

# ---------------------- Append New FAQ to CSV ----------------------

def add_faq_to_csv(question: str, answer: str):
    """Append a (question, answer) pair to the FAQ CSV unless the exact pair already exists."""
    frame = pd.read_csv(faq_path, encoding="utf-8")
    duplicate = ((frame["prompt"] == question) & (frame["response"] == answer)).any()
    if duplicate:
        return
    row = pd.DataFrame([{"id": str(uuid.uuid4()), "prompt": question, "response": answer}])
    pd.concat([frame, row], ignore_index=True).to_csv(faq_path, index=False, encoding="utf-8")
ircc_updater.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from langchain.embeddings import SentenceTransformerEmbeddings
4
+ from faq_services import db
5
+ from apscheduler.schedulers.background import BackgroundScheduler
6
+ from langchain.schema import Document
7
+
8
# Config
# NOTE(review): this duplicates the embedding model instantiated in
# faq_services — reusing that instance would avoid loading the model twice.
embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Pages scraped periodically for immigration updates.
ircc_urls = [
    "https://www.canada.ca/en/immigration-refugees-citizenship.html",
    "https://www.canadavisa.com/ircc.html",
    "https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada.html"
]
15
+
16
# Scrape IRCC
def scrape_ircc_content():
    """Scrape headline/paragraph text from the configured IRCC pages.

    Returns a de-duplicated list of snippets longer than 50 characters,
    preserving first-seen order.  A failure on one URL is logged and the
    remaining URLs are still scraped.
    """
    results = []
    for url in ircc_urls:
        try:
            resp = requests.get(url, timeout=10)
            # Skip 4xx/5xx responses instead of scraping error-page HTML
            # as if it were content (the original parsed whatever came back).
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            for tag in soup.find_all(["h2", "h3", "p"]):
                text = tag.get_text(strip=True)
                if len(text) > 50:
                    results.append(text)
        except Exception as e:
            print(f"[IRCC] Failed to fetch from {url}: {e}")
    # dict.fromkeys de-duplicates while keeping order; the original's
    # list(set(...)) produced a different ordering on every run.
    return list(dict.fromkeys(results))
30
+
31
# Update Vector DB
def update_ircc_embeddings():
    """Scrape the IRCC pages and add not-yet-indexed snippets to the vector DB.

    Returns the list of newly added snippet texts (possibly empty).

    NOTE(review): `db` is the binding imported from faq_services at module
    load; if faq_routes rebuilds its own `db` via load_faqs(), this module
    still writes to the original store object — confirm that is intended.
    """
    print("[IRCC] Checking for new IRCC content...")
    texts = scrape_ircc_content()
    new_texts = []

    for t in texts:
        # A snippet is "new" when the closest existing entry is not an
        # exact (case-insensitive) match; k=1 keeps the check cheap.
        results = db.similarity_search(t, k=1)
        if not results or results[0].page_content.strip().lower() != t.strip().lower():
            new_texts.append(t)

    if new_texts:
        documents = [Document(page_content=text) for text in new_texts]
        # add_documents embeds internally using the store's own embedding
        # function.  The original also pre-computed embeddings here and
        # passed them as an unsupported keyword, embedding every snippet
        # twice for no effect.
        db.add_documents(documents)
        print(f"[IRCC] Added {len(new_texts)} new entries.")
    else:
        print("[IRCC] No new entries found.")
    return new_texts
54
+
55
def manual_ircc_update():
    """Run an IRCC refresh, discarding the result (fire-and-forget trigger)."""
    update_ircc_embeddings()

def manual_ircc_update_with_result():
    """Run an IRCC refresh and return the list of newly added snippet texts."""
    return update_ircc_embeddings()
60
+
61
# Scheduler
def start_ircc_scheduler():
    """Start a background job that refreshes IRCC content every 7 days.

    Returns the running BackgroundScheduler so the caller can shut it down
    cleanly; the original discarded the reference, making a controlled
    shutdown impossible.
    """
    scheduler = BackgroundScheduler()
    scheduler.add_job(update_ircc_embeddings, 'interval', days=7)
    scheduler.start()
    print("[IRCC] Scheduler started: checks every 7 days.")
    return scheduler
main.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#API Packages
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

#Route Calling From faq_routes.py
from faq_routes import router as faq_router
from ircc_updater import start_ircc_scheduler

app = FastAPI()

# Enable CORS for frontend.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True
# effectively lets any origin send credentialed requests — restrict the
# origin list before production deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include FAQ API routes
app.include_router(faq_router)

@app.on_event("startup")
async def startup_event():
    # Kick off the weekly IRCC refresh job once the app is up.
    start_ircc_scheduler()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chromadb==1.0.9
2
+ fastapi==0.115.9
3
+ uvicorn==0.34.2
4
+ google-generativeai==0.8.5
5
+ langchain==0.3.25
6
+ langchain-community==0.3.24
7
+ python-dotenv==1.1.0
8
+ pandas==2.2.3
9
+ python-multipart==0.0.20
10
+ sentence-transformers==4.1.0
11
+ pymilvus
12
+ BeautifulSoup4
13
+ APScheduler
14
+ requests