shakauthossain committed on
Commit
a33c0b6
·
1 Parent(s): 728520f

Added: Free Embedding

Browse files
Files changed (4) hide show
  1. .env +1 -1
  2. faq_services.py +18 -9
  3. ircc_updater.py +2 -7
  4. requirements.txt +4 -0
.env CHANGED
@@ -2,4 +2,4 @@ GOOGLE_API_KEY=AIzaSyBA4RZNkKsqwkRrMCPbio7zrQ4xo12XpHI
2
  ZILLIZ_URI=https://in03-1da78c9c65effba.serverless.aws-eu-central-1.cloud.zilliz.com
3
  ZILLIZ_TOKEN=02a96317810ef222f0752a53bd9a3a2d0e46740ccf85f8ec4d18a4427bface8e82d57735f54fb0592802ed883573aeee40e9f529
4
  ZILLIZ_COLLECTION=visaverse_faqs
5
- OPENAI_API_KEY=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
 
2
  ZILLIZ_URI=https://in03-1da78c9c65effba.serverless.aws-eu-central-1.cloud.zilliz.com
3
  ZILLIZ_TOKEN=02a96317810ef222f0752a53bd9a3a2d0e46740ccf85f8ec4d18a4427bface8e82d57735f54fb0592802ed883573aeee40e9f529
4
  ZILLIZ_COLLECTION=visaverse_faqs
5
+ OPENAI_API_KEY=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
faq_services.py CHANGED
@@ -14,6 +14,7 @@ from langchain.schema import HumanMessage
14
  from langchain.docstore.document import Document
15
  from langchain_community.document_loaders import CSVLoader
16
  from langchain.schema import SystemMessage, HumanMessage
 
17
  import difflib
18
  from pymilvus import connections, utility, Collection
19
  from pymilvus.orm.schema import FieldSchema
@@ -26,10 +27,18 @@ os.environ["HF_HOME"] = "/tmp/hf_cache" # Optional cleanup
26
  # ---------------------- File & Model Config ----------------------
27
 
28
  faq_path = "faqs.csv"
29
- embedding_model = OpenAIEmbeddings(
30
- model="text-embedding-3-small",
31
- openai_api_key=os.getenv("OPENAI_API_KEY")
 
 
 
 
32
  )
 
 
 
 
33
 
34
  IMPORTANT_KEYWORDS = [
35
  "visa", "permanent residency", "PR", "study permit", "work permit", "immigration",
@@ -94,16 +103,16 @@ def ensure_collection_matches_schema(expected_dim: int, collection_name: str, ur
94
  if field.params and "dim" in field.params:
95
  actual_dim = int(field.params["dim"])
96
  if actual_dim != expected_dim:
97
- print(f"Collection '{collection_name}' has dim {actual_dim}, expected {expected_dim}. Dropping it.")
98
  utility.drop_collection(collection_name)
99
  return
100
  else:
101
- print(f"Collection '{collection_name}' has correct dimension: {expected_dim}.")
102
  return
103
- print(f"Could not find vector field in collection '{collection_name}'. Dropping for safety.")
104
  utility.drop_collection(collection_name)
105
  else:
106
- print(f"Collection '{collection_name}' does not exist. It will be created.")
107
  except Exception as e:
108
  print(f"Failed to validate or drop collection: {e}")
109
 
@@ -113,9 +122,9 @@ def load_faqs():
113
  if not os.path.exists(faq_path):
114
  pd.DataFrame(columns=["id", "prompt", "response"]).to_csv(faq_path, index=False, encoding="utf-8")
115
 
116
- # Check collection schema
117
  ensure_collection_matches_schema(
118
- expected_dim=1536,
119
  collection_name=collection_name,
120
  uri=milvus_uri,
121
  token=milvus_token
 
14
  from langchain.docstore.document import Document
15
  from langchain_community.document_loaders import CSVLoader
16
  from langchain.schema import SystemMessage, HumanMessage
17
+ from langchain_community.embeddings import HuggingFaceEmbeddings
18
  import difflib
19
  from pymilvus import connections, utility, Collection
20
  from pymilvus.orm.schema import FieldSchema
 
27
  # ---------------------- File & Model Config ----------------------
28
 
29
  faq_path = "faqs.csv"
30
+
31
+ # 💰 FREE Embeddings - Sentence Transformers (saves ~$18/day!)
32
+ print("🚀 Loading FREE embedding model (all-mpnet-base-v2)...")
33
+ embedding_model = HuggingFaceEmbeddings(
34
+ model_name="sentence-transformers/all-mpnet-base-v2",
35
+ model_kwargs={'device': 'cpu'},
36
+ encode_kwargs={'normalize_embeddings': True}
37
  )
38
+ print("✅ FREE embedding model loaded successfully!")
39
+
40
+ # NOTE: Dimension changed from 1536 (OpenAI) to 768 (mpnet)
41
+ EMBEDDING_DIM = 768
42
 
43
  IMPORTANT_KEYWORDS = [
44
  "visa", "permanent residency", "PR", "study permit", "work permit", "immigration",
 
103
  if field.params and "dim" in field.params:
104
  actual_dim = int(field.params["dim"])
105
  if actual_dim != expected_dim:
106
+ print(f"⚠️ Collection dim mismatch: {actual_dim} vs {expected_dim}. Dropping old collection.")
107
  utility.drop_collection(collection_name)
108
  return
109
  else:
110
+ print(f"Collection '{collection_name}' has correct dimension: {expected_dim}.")
111
  return
112
+ print(f"⚠️ Could not find vector field in collection '{collection_name}'. Dropping for safety.")
113
  utility.drop_collection(collection_name)
114
  else:
115
+ print(f"📝 Collection '{collection_name}' does not exist. It will be created.")
116
  except Exception as e:
117
  print(f"Failed to validate or drop collection: {e}")
118
 
 
122
  if not os.path.exists(faq_path):
123
  pd.DataFrame(columns=["id", "prompt", "response"]).to_csv(faq_path, index=False, encoding="utf-8")
124
 
125
+ # Check collection schema (768 dims for mpnet vs 1536 for OpenAI)
126
  ensure_collection_matches_schema(
127
+ expected_dim=768, # FREE model dimension
128
  collection_name=collection_name,
129
  uri=milvus_uri,
130
  token=milvus_token
ircc_updater.py CHANGED
@@ -1,18 +1,13 @@
1
  # updated ircc_updater.py
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from langchain_openai import OpenAIEmbeddings
5
  from langchain.schema import Document
6
- from faq_services import db
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
  import os
9
  from datetime import datetime
10
 
11
- # Config
12
- embedding_model = OpenAIEmbeddings(
13
- model="text-embedding-3-small",
14
- openai_api_key=os.getenv("OPENAI_API_KEY")
15
- )
16
 
17
 
18
  # Main IRCC pages to crawl for links
 
1
  # updated ircc_updater.py
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
  from langchain.schema import Document
5
+ from faq_services import db, embedding_model
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  import os
8
  from datetime import datetime
9
 
10
+ # Note: Using FREE embeddings from faq_services (all-mpnet-base-v2)
 
 
 
 
11
 
12
 
13
  # Main IRCC pages to crawl for links
requirements.txt CHANGED
@@ -10,6 +10,10 @@ langchain-openai
10
  openai
11
  tiktoken
12
 
 
 
 
 
13
  # Vector DB (Milvus/Zilliz only)
14
  pymilvus
15
 
 
10
  openai
11
  tiktoken
12
 
13
+ # FREE Embeddings (Sentence Transformers)
14
+ sentence-transformers
15
+ torch
16
+
17
  # Vector DB (Milvus/Zilliz only)
18
  pymilvus
19