Spaces:
Runtime error
Runtime error
Commit ·
a33c0b6
1
Parent(s): 728520f
Added: Free Embedding
Browse files- .env +1 -1
- faq_services.py +18 -9
- ircc_updater.py +2 -7
- requirements.txt +4 -0
.env
CHANGED
|
@@ -2,4 +2,4 @@ GOOGLE_API_KEY=<REDACTED>
|
|
| 2 |
ZILLIZ_URI=https://in03-1da78c9c65effba.serverless.aws-eu-central-1.cloud.zilliz.com
|
| 3 |
ZILLIZ_TOKEN=<REDACTED>
|
| 4 |
ZILLIZ_COLLECTION=visaverse_faqs
|
| 5 |
-
OPENAI_API_KEY=<REDACTED>
|
|
|
|
| 2 |
ZILLIZ_URI=https://in03-1da78c9c65effba.serverless.aws-eu-central-1.cloud.zilliz.com
|
| 3 |
ZILLIZ_TOKEN=<REDACTED>
|
| 4 |
ZILLIZ_COLLECTION=visaverse_faqs
|
| 5 |
+
OPENAI_API_KEY=<REDACTED>
|
faq_services.py
CHANGED
|
@@ -14,6 +14,7 @@ from langchain.schema import HumanMessage
|
|
| 14 |
from langchain.docstore.document import Document
|
| 15 |
from langchain_community.document_loaders import CSVLoader
|
| 16 |
from langchain.schema import SystemMessage, HumanMessage
|
|
|
|
| 17 |
import difflib
|
| 18 |
from pymilvus import connections, utility, Collection
|
| 19 |
from pymilvus.orm.schema import FieldSchema
|
|
@@ -26,10 +27,18 @@ os.environ["HF_HOME"] = "/tmp/hf_cache" # Optional cleanup
|
|
| 26 |
# ---------------------- File & Model Config ----------------------
|
| 27 |
|
| 28 |
faq_path = "faqs.csv"
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
IMPORTANT_KEYWORDS = [
|
| 35 |
"visa", "permanent residency", "PR", "study permit", "work permit", "immigration",
|
|
@@ -94,16 +103,16 @@ def ensure_collection_matches_schema(expected_dim: int, collection_name: str, ur
|
|
| 94 |
if field.params and "dim" in field.params:
|
| 95 |
actual_dim = int(field.params["dim"])
|
| 96 |
if actual_dim != expected_dim:
|
| 97 |
-
print(f"Collection
|
| 98 |
utility.drop_collection(collection_name)
|
| 99 |
return
|
| 100 |
else:
|
| 101 |
-
print(f"Collection '{collection_name}' has correct dimension: {expected_dim}.")
|
| 102 |
return
|
| 103 |
-
print(f"Could not find vector field in collection '{collection_name}'. Dropping for safety.")
|
| 104 |
utility.drop_collection(collection_name)
|
| 105 |
else:
|
| 106 |
-
print(f"Collection '{collection_name}' does not exist. It will be created.")
|
| 107 |
except Exception as e:
|
| 108 |
print(f"Failed to validate or drop collection: {e}")
|
| 109 |
|
|
@@ -113,9 +122,9 @@ def load_faqs():
|
|
| 113 |
if not os.path.exists(faq_path):
|
| 114 |
pd.DataFrame(columns=["id", "prompt", "response"]).to_csv(faq_path, index=False, encoding="utf-8")
|
| 115 |
|
| 116 |
-
# Check collection schema
|
| 117 |
ensure_collection_matches_schema(
|
| 118 |
-
expected_dim=
|
| 119 |
collection_name=collection_name,
|
| 120 |
uri=milvus_uri,
|
| 121 |
token=milvus_token
|
|
|
|
| 14 |
from langchain.docstore.document import Document
|
| 15 |
from langchain_community.document_loaders import CSVLoader
|
| 16 |
from langchain.schema import SystemMessage, HumanMessage
|
| 17 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 18 |
import difflib
|
| 19 |
from pymilvus import connections, utility, Collection
|
| 20 |
from pymilvus.orm.schema import FieldSchema
|
|
|
|
| 27 |
# ---------------------- File & Model Config ----------------------
|
| 28 |
|
| 29 |
faq_path = "faqs.csv"
|
| 30 |
+
|
| 31 |
+
# 💰 FREE Embeddings - Sentence Transformers (saves ~$18/day!)
|
| 32 |
+
print("🚀 Loading FREE embedding model (all-mpnet-base-v2)...")
|
| 33 |
+
embedding_model = HuggingFaceEmbeddings(
|
| 34 |
+
model_name="sentence-transformers/all-mpnet-base-v2",
|
| 35 |
+
model_kwargs={'device': 'cpu'},
|
| 36 |
+
encode_kwargs={'normalize_embeddings': True}
|
| 37 |
)
|
| 38 |
+
print("✅ FREE embedding model loaded successfully!")
|
| 39 |
+
|
| 40 |
+
# NOTE: Dimension changed from 1536 (OpenAI) to 768 (mpnet)
|
| 41 |
+
EMBEDDING_DIM = 768
|
| 42 |
|
| 43 |
IMPORTANT_KEYWORDS = [
|
| 44 |
"visa", "permanent residency", "PR", "study permit", "work permit", "immigration",
|
|
|
|
| 103 |
if field.params and "dim" in field.params:
|
| 104 |
actual_dim = int(field.params["dim"])
|
| 105 |
if actual_dim != expected_dim:
|
| 106 |
+
print(f"⚠️ Collection dim mismatch: {actual_dim} vs {expected_dim}. Dropping old collection.")
|
| 107 |
utility.drop_collection(collection_name)
|
| 108 |
return
|
| 109 |
else:
|
| 110 |
+
print(f"✅ Collection '{collection_name}' has correct dimension: {expected_dim}.")
|
| 111 |
return
|
| 112 |
+
print(f"⚠️ Could not find vector field in collection '{collection_name}'. Dropping for safety.")
|
| 113 |
utility.drop_collection(collection_name)
|
| 114 |
else:
|
| 115 |
+
print(f"📝 Collection '{collection_name}' does not exist. It will be created.")
|
| 116 |
except Exception as e:
|
| 117 |
print(f"Failed to validate or drop collection: {e}")
|
| 118 |
|
|
|
|
| 122 |
if not os.path.exists(faq_path):
|
| 123 |
pd.DataFrame(columns=["id", "prompt", "response"]).to_csv(faq_path, index=False, encoding="utf-8")
|
| 124 |
|
| 125 |
+
# Check collection schema (768 dims for mpnet vs 1536 for OpenAI)
|
| 126 |
ensure_collection_matches_schema(
|
| 127 |
+
expected_dim=768, # FREE model dimension
|
| 128 |
collection_name=collection_name,
|
| 129 |
uri=milvus_uri,
|
| 130 |
token=milvus_token
|
ircc_updater.py
CHANGED
|
@@ -1,18 +1,13 @@
|
|
| 1 |
# updated ircc_updater.py
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
-
from langchain_openai import OpenAIEmbeddings
|
| 5 |
from langchain.schema import Document
|
| 6 |
-
from faq_services import db
|
| 7 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 8 |
import os
|
| 9 |
from datetime import datetime
|
| 10 |
|
| 11 |
-
#
|
| 12 |
-
embedding_model = OpenAIEmbeddings(
|
| 13 |
-
model="text-embedding-3-small",
|
| 14 |
-
openai_api_key=os.getenv("OPENAI_API_KEY")
|
| 15 |
-
)
|
| 16 |
|
| 17 |
|
| 18 |
# Main IRCC pages to crawl for links
|
|
|
|
| 1 |
# updated ircc_updater.py
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
|
|
|
| 4 |
from langchain.schema import Document
|
| 5 |
+
from faq_services import db, embedding_model
|
| 6 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 7 |
import os
|
| 8 |
from datetime import datetime
|
| 9 |
|
| 10 |
+
# Note: Using FREE embeddings from faq_services (all-mpnet-base-v2)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
# Main IRCC pages to crawl for links
|
requirements.txt
CHANGED
|
@@ -10,6 +10,10 @@ langchain-openai
|
|
| 10 |
openai
|
| 11 |
tiktoken
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Vector DB (Milvus/Zilliz only)
|
| 14 |
pymilvus
|
| 15 |
|
|
|
|
| 10 |
openai
|
| 11 |
tiktoken
|
| 12 |
|
| 13 |
+
# FREE Embeddings (Sentence Transformers)
|
| 14 |
+
sentence-transformers
|
| 15 |
+
torch
|
| 16 |
+
|
| 17 |
# Vector DB (Milvus/Zilliz only)
|
| 18 |
pymilvus
|
| 19 |
|