Spaces:
Runtime error
Runtime error
Upload 4 files
Browse files
- chatbot_prompt.py +1 -25
- faq_routes.py +1 -49
- faq_services.py +79 -5
- ircc_updater.py +4 -1
chatbot_prompt.py
CHANGED
|
@@ -1,26 +1,2 @@
|
|
| 1 |
def generate_prompt(context: str, query: str) -> str:
    """Build the full instruction prompt for the FAQ assistant.

    The returned text contains the assistant's behavior rules, followed by
    the retrieved FAQ ``context`` and the user's ``query``, and ends with
    an ``Answer:`` cue for the model to complete.

    Args:
        context: Text retrieved for the question, inserted verbatim.
        query: The user's question, inserted verbatim.

    Returns:
        The assembled prompt string.
    """
    # The instruction text is runtime content sent to the model; keep its
    # wording unchanged.
    return f"""

When a user greets you, kindly introduce yourself as Noah, VisaVerse’s AI assistant. Avoid any unnecessary greetings or random greetings. Do not greet everytime.

For all visa, immigration, or VisaVerse-related questions, first try to find answers from the provided FAQs.

If the answer to a VisaVerse-related question is not found in the FAQs and the question is critical or detailed, you may perform a web search limited to content related to https://visaverse.ca to find accurate and relevant information. If you still cannot answer, politely suggest visiting the website for more information.

For basic or general non-technical questions (such as “What is a visa?”, “What are the types of study permits?”, “How long does a tourist visa last in general?”, and many more), you are allowed to use web search to provide a brief, factual answer — even if it's not in the VisaVerse FAQ.

Do not answer personal, medical, financial, or legal advice-based questions from the internet. In those cases, refer the user to VisaVerse's site.

Avoid using technical terms or programming-related jargon unless directly relevant.

Always ensure responses are clear, factual, and aligned with VisaVerse’s professional tone.

When an FAQ-based answer is simply "yes" or "no", rephrase it naturally into a full sentence (e.g., instead of "Yes", say "Yes, you can apply for a visa extension").

Never fabricate information. If something isn’t covered, politely say it isn’t currently available and suggest visiting the VisaVerse website.
Use the following context to answer the user's question:

{context}

User Question: {query}
Answer:"""
|
|
|
|
| 1 |
def generate_prompt(context: str, query: str) -> str:
    """Build the user-turn prompt from retrieved context and the question.

    Only the context and the question are formatted here; no behavior
    rules are included in the returned text.

    Args:
        context: Text retrieved for the question, inserted verbatim.
        query: The user's question, inserted verbatim.

    Returns:
        ``"Context:\\n<context>\\n\\nUser Question: <query>\\nAnswer:"``.
    """
    return f"Context:\n{context}\n\nUser Question: {query}\nAnswer:"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
faq_routes.py
CHANGED
|
@@ -21,11 +21,6 @@ from ircc_updater import manual_ircc_update_with_result
|
|
| 21 |
|
| 22 |
router = APIRouter()
|
| 23 |
|
| 24 |
-
user_question_count = defaultdict(int)
|
| 25 |
-
QUESTION_LIMIT = 3
|
| 26 |
-
WHATSAPP_LINK = "https://wa.me/+8801712922233"
|
| 27 |
-
GREETING_KEYWORDS = {"hi", "hello", "hey", "good morning", "good evening", "good afternoon", "greetings"}
|
| 28 |
-
QUESTION_LOG_FILE = "question_limit_log.json"
|
| 29 |
|
| 30 |
# Data validation classes
|
| 31 |
class QuestionRequest(BaseModel):
|
|
@@ -35,53 +30,10 @@ class FAQItem(BaseModel):
|
|
| 35 |
question: str
|
| 36 |
answer: str
|
| 37 |
|
| 38 |
-
def is_greeting(text: str) -> bool:
    """Return True when ``text`` looks like a greeting rather than a question.

    A message counts as a greeting when, after lower-casing and stripping,
    it is at most 12 characters long, or when it contains one of
    ``GREETING_KEYWORDS`` as a whole word or phrase.

    Args:
        text: Raw user message.

    Returns:
        True if the message is treated as a greeting, False otherwise.
    """
    import re

    lower = text.lower().strip()

    # Very short messages are treated as greetings/small talk.
    if len(lower) <= 12:
        return True

    # Match on word boundaries: a plain substring test would flag ordinary
    # questions as greetings ("hi" is inside "this", "which", "while";
    # "hey" is inside "they").
    return any(
        re.search(rf"\b{re.escape(greet)}\b", lower)
        for greet in GREETING_KEYWORDS
    )
|
| 41 |
-
|
| 42 |
-
def load_question_log() -> dict:
    """Load the persisted question log from ``QUESTION_LOG_FILE``.

    Returns:
        The parsed JSON content of the log file, or an empty dict when the
        file does not exist yet.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Open directly instead of checking existence first: avoids the gap
    # between the check and the open.
    try:
        with open(QUESTION_LOG_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
|
| 47 |
-
|
| 48 |
-
def save_question_log(log_data):
    """Persist ``log_data`` as JSON to ``QUESTION_LOG_FILE``.

    The data is written to a sibling temporary file first and then moved
    over the real file, so an interrupted write cannot leave a truncated
    log behind for the next load to fail on.

    Args:
        log_data: JSON-serializable object to store.
    """
    tmp_path = f"{QUESTION_LOG_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(log_data, f)
    # os.replace overwrites the destination in a single rename step.
    os.replace(tmp_path, QUESTION_LOG_FILE)
|
| 51 |
-
|
| 52 |
-
user_question_count = load_question_log()
|
| 53 |
-
|
| 54 |
-
def get_client_ip(request: Request) -> str:
    """Determine the caller's IP address for a request.

    The first non-empty entry of the ``x-forwarded-for`` header is
    preferred; otherwise the address of the directly connected client is
    used.

    Args:
        request: The incoming HTTP request.

    Returns:
        The client IP address as a string.

    Raises:
        HTTPException: 400 when neither source yields an address.
    """
    # NOTE(review): x-forwarded-for is supplied by the client unless a
    # trusted proxy overwrites it, so this value can be spoofed — confirm
    # the deployment sits behind such a proxy before relying on it.
    forwarded = request.headers.get("x-forwarded-for") or ""
    for candidate in forwarded.split(","):
        candidate = candidate.strip()
        # Skip blank entries (e.g. a header of ", 1.2.3.4") instead of
        # returning an empty string as the IP.
        if candidate:
            return candidate

    if request.client and request.client.host:
        return request.client.host

    # Raise error if no IP found
    raise HTTPException(status_code=400, detail="Unable to determine client IP address.")
|
| 63 |
-
|
| 64 |
@router.post("/ask")
|
| 65 |
-
async def ask_faq(request: QuestionRequest
|
| 66 |
-
ip = get_client_ip(http_request)
|
| 67 |
query = request.query.strip()
|
| 68 |
|
| 69 |
-
count = user_question_count.get(ip, 0)
|
| 70 |
-
|
| 71 |
-
if count >= QUESTION_LIMIT:
|
| 72 |
-
return {
|
| 73 |
-
"answer": f"For more information. Please contact us on WhatsApp: {WHATSAPP_LINK}"
|
| 74 |
-
}
|
| 75 |
-
|
| 76 |
-
if not is_greeting(query):
|
| 77 |
-
user_question_count[ip] = count + 1
|
| 78 |
-
save_question_log(user_question_count)
|
| 79 |
-
|
| 80 |
-
if user_question_count[ip] >= QUESTION_LIMIT:
|
| 81 |
-
return {
|
| 82 |
-
"answer": f"For more information. Please contact us on WhatsApp: {WHATSAPP_LINK}"
|
| 83 |
-
}
|
| 84 |
-
|
| 85 |
results = db.similarity_search(query, k=3)
|
| 86 |
context = "\n\n".join([doc.page_content for doc in results])
|
| 87 |
prompt = generate_prompt(context, query)
|
|
|
|
| 21 |
|
| 22 |
router = APIRouter()
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# Data validation classes
|
| 26 |
class QuestionRequest(BaseModel):
|
|
|
|
| 30 |
question: str
|
| 31 |
answer: str
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
@router.post("/ask")
|
| 34 |
+
async def ask_faq(request: QuestionRequest):
|
|
|
|
| 35 |
query = request.query.strip()
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
results = db.similarity_search(query, k=3)
|
| 38 |
context = "\n\n".join([doc.page_content for doc in results])
|
| 39 |
prompt = generate_prompt(context, query)
|
faq_services.py
CHANGED
|
@@ -11,6 +11,9 @@ from langchain.chat_models import ChatOpenAI
|
|
| 11 |
from langchain.schema import HumanMessage
|
| 12 |
from langchain.docstore.document import Document
|
| 13 |
from langchain_community.document_loaders import CSVLoader
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# ---------------------- Environment Setup ----------------------
|
| 16 |
|
|
@@ -20,12 +23,45 @@ os.environ["HF_HOME"] = "/tmp/hf_cache" # Optional cleanup
|
|
| 20 |
# ---------------------- File & Model Config ----------------------
|
| 21 |
|
| 22 |
faq_path = "faqs.csv"
|
| 23 |
-
embedding_model = OpenAIEmbeddings(
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# Zilliz (Milvus) Cloud Config
|
| 26 |
milvus_uri = os.getenv("ZILLIZ_URI")
|
| 27 |
milvus_token = os.getenv("ZILLIZ_TOKEN")
|
| 28 |
-
collection_name = os.getenv("ZILLIZ_COLLECTION", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# ---------------------- Load FAQ Vector DB ----------------------
|
| 31 |
|
|
@@ -33,6 +69,14 @@ def load_faqs():
|
|
| 33 |
if not os.path.exists(faq_path):
|
| 34 |
pd.DataFrame(columns=["id", "prompt", "response"]).to_csv(faq_path, index=False, encoding="utf-8")
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
loader = CSVLoader(faq_path, encoding="utf-8")
|
| 37 |
docs = loader.load()
|
| 38 |
|
|
@@ -49,17 +93,46 @@ def load_faqs():
|
|
| 49 |
collection_name=collection_name,
|
| 50 |
)
|
| 51 |
|
|
|
|
| 52 |
db = load_faqs()
|
| 53 |
|
| 54 |
# ---------------------- LLM Wrapper ----------------------
|
| 55 |
|
| 56 |
def ask_openai(prompt: str) -> str:
|
| 57 |
chat = ChatOpenAI(
|
| 58 |
-
model_name="gpt-4o",
|
| 59 |
temperature=0.5,
|
| 60 |
openai_api_key=os.getenv("OPENAI_API_KEY")
|
| 61 |
)
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
# ---------------------- Append New FAQ to CSV ----------------------
|
| 65 |
|
|
@@ -69,4 +142,5 @@ def add_faq_to_csv(question: str, answer: str):
|
|
| 69 |
new_row = pd.DataFrame([{"id": str(uuid.uuid4()), "prompt": question, "response": answer}])
|
| 70 |
df = pd.concat([df, new_row], ignore_index=True)
|
| 71 |
df.to_csv(faq_path, index=False, encoding="utf-8")
|
| 72 |
-
|
|
|
|
|
|
| 11 |
from langchain.schema import HumanMessage
|
| 12 |
from langchain.docstore.document import Document
|
| 13 |
from langchain_community.document_loaders import CSVLoader
|
| 14 |
+
from langchain.schema import SystemMessage, HumanMessage
|
| 15 |
+
from pymilvus import connections, utility, Collection
|
| 16 |
+
from pymilvus.orm.schema import FieldSchema
|
| 17 |
|
| 18 |
# ---------------------- Environment Setup ----------------------
|
| 19 |
|
|
|
|
| 23 |
# ---------------------- File & Model Config ----------------------
|
| 24 |
|
| 25 |
faq_path = "faqs.csv"
|
| 26 |
+
embedding_model = OpenAIEmbeddings(
|
| 27 |
+
model="text-embedding-3-small",
|
| 28 |
+
openai_api_key=os.getenv("OPENAI_API_KEY")
|
| 29 |
+
)
|
| 30 |
|
| 31 |
# Zilliz (Milvus) Cloud Config
|
| 32 |
milvus_uri = os.getenv("ZILLIZ_URI")
|
| 33 |
milvus_token = os.getenv("ZILLIZ_TOKEN")
|
| 34 |
+
collection_name = os.getenv("ZILLIZ_COLLECTION", "visaverse_faqs3")
|
| 35 |
+
|
| 36 |
+
connections.connect(
|
| 37 |
+
alias="default",
|
| 38 |
+
uri=os.getenv("ZILLIZ_URI"),
|
| 39 |
+
token=os.getenv("ZILLIZ_TOKEN")
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
def _find_vector_dim(schema):
    """Return the ``dim`` of the first FLOAT_VECTOR field declaring one, else None."""
    for field in schema.fields:
        if field.dtype.name != "FLOAT_VECTOR":
            continue
        if field.params and "dim" in field.params:
            return int(field.params["dim"])
    return None


def ensure_collection_matches_schema(expected_dim: int, collection_name: str, uri: str, token: str):
    """Drop the collection if its vector dimension does not match.

    Connects with the given credentials and inspects the collection's
    schema. The collection is dropped when its vector field has a
    dimension other than ``expected_dim`` or when no vector dimension can
    be found; it is left untouched when the dimension matches or the
    collection does not exist.

    Args:
        expected_dim: Dimension the vector field must have.
        collection_name: Name of the collection to check.
        uri: Connection URI passed to ``connections.connect``.
        token: Access token passed to ``connections.connect``.

    Returns:
        None. Outcomes are reported with ``print``; any exception raised
        while connecting, inspecting or dropping is printed, not raised.
    """
    try:
        connections.connect(uri=uri, token=token)

        if not utility.has_collection(collection_name):
            print(f"Collection '{collection_name}' does not exist. It will be created.")
            return

        actual_dim = _find_vector_dim(Collection(collection_name).schema)

        if actual_dim is None:
            print(f"Could not find vector field in collection '{collection_name}'. Dropping for safety.")
            utility.drop_collection(collection_name)
        elif actual_dim != expected_dim:
            print(f"Collection '{collection_name}' has dim {actual_dim}, expected {expected_dim}. Dropping it.")
            utility.drop_collection(collection_name)
        else:
            print(f"Collection '{collection_name}' has correct dimension: {expected_dim}.")
    except Exception as e:
        # Best-effort check: a failure here is reported but must not stop
        # the caller from continuing.
        print(f"Failed to validate or drop collection: {e}")
|
| 65 |
|
| 66 |
# ---------------------- Load FAQ Vector DB ----------------------
|
| 67 |
|
|
|
|
| 69 |
if not os.path.exists(faq_path):
|
| 70 |
pd.DataFrame(columns=["id", "prompt", "response"]).to_csv(faq_path, index=False, encoding="utf-8")
|
| 71 |
|
| 72 |
+
# Check collection schema
|
| 73 |
+
ensure_collection_matches_schema(
|
| 74 |
+
expected_dim=1536,
|
| 75 |
+
collection_name=collection_name,
|
| 76 |
+
uri=milvus_uri,
|
| 77 |
+
token=milvus_token
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
loader = CSVLoader(faq_path, encoding="utf-8")
|
| 81 |
docs = loader.load()
|
| 82 |
|
|
|
|
| 93 |
collection_name=collection_name,
|
| 94 |
)
|
| 95 |
|
| 96 |
+
|
| 97 |
db = load_faqs()
|
| 98 |
|
| 99 |
# ---------------------- LLM Wrapper ----------------------
|
| 100 |
|
| 101 |
# System instructions sent with every request; runtime text, keep wording as is.
_NOAH_SYSTEM_PROMPT = """
You are Noah, an intelligent and friendly virtual assistant by VisaVerse. Your primary responsibility is to assist users by answering their questions accurately and concisely based on the official VisaVerse FAQs and the website: https://visaverse.ca.

Behavior rules:
- When a user greets you, kindly introduce yourself as Noah, VisaVerse’s AI assistant.
- Do NOT greet every time.
- Do NOT greet more than once in a session.
- Avoid unnecessary greetings or restating your name repeatedly.

Answering rules:
- For visa, immigration, or VisaVerse-related questions, first try to answer using the provided FAQ context.
- If the FAQ does not contain the answer, and the question is detailed or important, you may use a web search (only limited to content from https://visaverse.ca).
- For basic/general visa-related questions, you may answer briefly even if not in the FAQ.
- If a question is unregistered in the FAQ database or you are unable to answer it confidently, say: "This query is not currently addressed in VisaVerse and IRCC database. For further assistance, please contact our support team at team@visaverse.ca."

Additional rules:
- Never answer personal, legal, financial, or medical questions — always refer users to the official VisaVerse site.
- Never guess or fabricate information. If unsure, suggest visiting the VisaVerse website or contacting support.
- Always use clear, neutral, and professional tone.
- Avoid technical or programming language unless explicitly relevant.
- If the answer is just “yes” or “no”, rewrite it as a full, natural sentence.
"""


def ask_openai(prompt: str) -> str:
    """Send ``prompt`` to the chat model and return its reply text.

    The request consists of the fixed system instructions followed by
    ``prompt`` as the human message.

    Args:
        prompt: The user-turn text (context plus question).

    Returns:
        The model's reply content with surrounding whitespace stripped.
    """
    chat = ChatOpenAI(
        model_name="gpt-4o",
        temperature=0.5,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )

    messages = [
        SystemMessage(content=_NOAH_SYSTEM_PROMPT),
        HumanMessage(content=prompt),
    ]
    # invoke() is the supported entry point; calling the model object
    # directly is deprecated in current LangChain releases.
    return chat.invoke(messages).content.strip()
|
| 135 |
+
|
| 136 |
|
| 137 |
# ---------------------- Append New FAQ to CSV ----------------------
|
| 138 |
|
|
|
|
| 142 |
new_row = pd.DataFrame([{"id": str(uuid.uuid4()), "prompt": question, "response": answer}])
|
| 143 |
df = pd.concat([df, new_row], ignore_index=True)
|
| 144 |
df.to_csv(faq_path, index=False, encoding="utf-8")
|
| 145 |
+
|
| 146 |
+
|
ircc_updater.py
CHANGED
|
@@ -8,7 +8,10 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|
| 8 |
import os
|
| 9 |
|
| 10 |
# Config
|
| 11 |
-
embedding_model = OpenAIEmbeddings(
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
ircc_urls = [
|
| 14 |
"https://www.canada.ca/en/immigration-refugees-citizenship.html",
|
|
|
|
| 8 |
import os
|
| 9 |
|
| 10 |
# Config
|
| 11 |
+
embedding_model = OpenAIEmbeddings(
|
| 12 |
+
model="text-embedding-3-small",
|
| 13 |
+
openai_api_key=os.getenv("OPENAI_API_KEY")
|
| 14 |
+
)
|
| 15 |
|
| 16 |
ircc_urls = [
|
| 17 |
"https://www.canada.ca/en/immigration-refugees-citizenship.html",
|