Arnavkumar01 committed on
Commit
9b454aa
·
1 Parent(s): 749195f

Initial Commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. Start with a lean and official Python base image
2
+ FROM python:3.10-slim
3
+
4
+ # 2. Set the working directory inside the container
5
+ WORKDIR /app
6
+
7
+ # 3. Copy only the requirements file first to leverage Docker's caching
8
+ COPY requirements.txt .
9
+
10
+ # 4. Install all Python dependencies from requirements.txt
11
+ # The --extra-index-url line in requirements.txt makes pip also search the PyTorch CPU wheel index.
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ # 5. Now, copy your application code into the container
15
+ # This assumes your code is in a folder named 'app'
16
+ COPY ./app /app/app
17
+
18
+ # 6. Define the command to run your application using Gunicorn
19
+ # The target 'app.main:app' is the 'app' object defined in 'main.py' inside the 'app' package.
20
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--worker-class", "uvicorn.workers.UvicornWorker", "--timeout", "0", "app.main:app"]
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (113 Bytes). View file
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (143 Bytes). View file
 
app/__pycache__/main.cpython-310.pyc ADDED
Binary file (672 Bytes). View file
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (624 Bytes). View file
 
app/api/__init__.py ADDED
File without changes
app/api/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (117 Bytes). View file
 
app/api/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (147 Bytes). View file
 
app/api/__pycache__/navigator.cpython-310.pyc ADDED
Binary file (1.57 kB). View file
 
app/api/__pycache__/navigator.cpython-312.pyc ADDED
Binary file (2.01 kB). View file
 
app/api/navigator.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, Depends
2
+ from pydantic import BaseModel
3
+ # This is the correct line
4
+ from app.services import rag_service
5
+ from app.core.security import get_api_key
6
+
7
+ router = APIRouter()
8
+
9
class QueryRequest(BaseModel):
    """Request body accepted by the navigator query endpoint."""

    # Free-text search query supplied by the client.
    query: str
    # Number of results requested from the hybrid search; defaults to 10.
    top_k: int = 10
12
+
13
@router.post("/navigator/query", dependencies=[Depends(get_api_key)])
def navigator_query(request: QueryRequest):
    """
    Run a hybrid search for the user's query and return a summary plus matches.

    Args:
        request: Request body carrying the search text (``query``) and the
            number of results to return (``top_k``).

    Returns:
        A dict with two keys: ``summary`` (the text produced by
        ``rag_service.generate_summary_response``) and ``professionals``
        (a list of ``{"professional_id", "content"}`` dicts, one per
        retrieved document).

    Raises:
        HTTPException: 400 when the query is blank or ``top_k`` is not
            positive; 500 when retrieval or summarisation fails.
    """
    import logging

    # Treat whitespace-only input the same as an empty query.
    query = request.query.strip()
    if not query:
        raise HTTPException(status_code=400, detail="Query cannot be empty.")
    # A non-positive top_k would otherwise turn into a from-the-end slice
    # inside the search code and return an arbitrary number of rows.
    if request.top_k < 1:
        raise HTTPException(status_code=400, detail="top_k must be a positive integer.")

    try:
        # Step 1: Retrieve relevant documents using hybrid search
        retrieved_docs = rag_service.hybrid_search(query, k=request.top_k)

        # Step 2: Generate a summary response based on the retrieved docs
        summary = rag_service.generate_summary_response(query, retrieved_docs)

        # Step 3: Format the retrieved docs for a clean JSON response
        results = []
        for doc in retrieved_docs:
            professional_id = doc.metadata.get("professional_id")
            results.append(
                {
                    # IDs may arrive as numpy.int64, which is not JSON
                    # serialisable; a missing ID is reported as null rather
                    # than crashing the whole request with int(None).
                    "professional_id": int(professional_id) if professional_id is not None else None,
                    "content": doc.page_content,
                }
            )

        return {
            "summary": summary,
            "professionals": results,
        }
    except HTTPException:
        # Deliberate HTTP errors must keep their status code.
        raise
    except Exception:
        # Log the full traceback server-side, but do not echo internal error
        # text (which may contain connection details) back to the client.
        logging.getLogger(__name__).exception("Navigator query failed")
        raise HTTPException(status_code=500, detail="Internal server error.")
app/core/__init__.py ADDED
File without changes
app/core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (118 Bytes). View file
 
app/core/__pycache__/security.cpython-310.pyc ADDED
Binary file (662 Bytes). View file
 
app/core/security.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from fastapi import Security, HTTPException, status
3
+ from fastapi.security import APIKeyHeader
4
+
5
# Name of the request header that must carry the API key.
api_key_header = APIKeyHeader(name="X-API-Key")

# Expected key, read once at import time; None when the variable is unset.
SECRET_KEY = os.getenv("INTERNAL_API_KEY")


async def get_api_key(api_key: str = Security(api_key_header)):
    """
    Validate the API key sent in the ``X-API-Key`` header.

    Args:
        api_key: Header value extracted by ``api_key_header``.

    Returns:
        The supplied key when it matches ``SECRET_KEY``.

    Raises:
        HTTPException: 401 when the key does not match, or when no
            non-empty ``INTERNAL_API_KEY`` is configured (fail closed).
    """
    import secrets

    # compare_digest runs in constant time, so response timing does not leak
    # how many leading characters of a guessed key were correct. Encoding to
    # bytes lets it accept non-ASCII header values without raising TypeError.
    if SECRET_KEY and api_key and secrets.compare_digest(
        api_key.encode("utf-8"), SECRET_KEY.encode("utf-8")
    ):
        return api_key

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid or missing API Key",
    )
app/main.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/main.py
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from .api import navigator
5
+
6
# Main FastAPI application instance.
app = FastAPI(title="Navigator AI API")

# Origins that are allowed to make cross-origin (CORS) requests to this API.
origins = [
    "http://localhost:3000",  # For local development of your frontend
    "https://business-orbit.onrender.com",  # Your production frontend URL
]

# Register the CORS middleware with the allow-list above; credentials are
# allowed, and methods and headers are unrestricted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)

# Mount the routes defined in navigator.py under the "/api" prefix.
app.include_router(navigator.router, prefix="/api")
26
+
27
@app.get("/")
def read_root():
    """Return a fixed status payload for requests to the root path."""
    status_payload = {"Status": "API is running"}
    return status_payload
app/models.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/models.py
2
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text
3
+ from sqlalchemy.orm import declarative_base
4
+ from sqlalchemy.sql import func
5
+
6
+ # In newer versions of SQLAlchemy, declarative_base is imported from sqlalchemy.orm
7
+ Base = declarative_base()
8
+
9
class Professional(Base):
    """ORM mapping for one row of the ``professionals_denormalized`` table."""

    __tablename__ = 'professionals_denormalized'

    # Identity and credentials.
    id = Column(Integer, primary_key=True)  # integer primary key
    name = Column(String(255), nullable=False)
    professional_role = Column(String(255))
    email = Column(String(255), unique=True, index=True, nullable=False)
    phone = Column(String(20), unique=True)  # optional, but unique when present
    password_hash = Column(String(255), nullable=False)

    # Profile media, free-text profile fields and external account IDs.
    profile_photo_url = Column(Text)
    profile_photo_id = Column(String(255))
    banner_url = Column(Text)
    banner_id = Column(String(255))
    skills = Column(Text)  # stored as plain text, not JSON
    description = Column(Text)
    google_id = Column(String(255))
    linkedin_id = Column(String(255))
    role = Column(String(50))
    avatar = Column(String(5))  # at most 5 characters

    # Flags and scores.
    is_admin = Column(Boolean, default=False)
    # The column name is passed explicitly so the mixed-case name is used as-is.
    rewardScore = Column("rewardScore", Integer, default=0)
    matchScore = Column("matchScore", Integer, default=0)

    # Up to five location strings.
    location1 = Column(String(255))
    location2 = Column(String(255))
    location3 = Column(String(255))
    location4 = Column(String(255))
    location5 = Column(String(255))

    # Timestamps: created_at is filled in by the database on insert;
    # updated_at has no default and is only set when a row is updated.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
app/services/__pycache__/rag_service.cpython-310.pyc ADDED
Binary file (4.29 kB). View file
 
app/services/__pycache__/rag_service.cpython-312.pyc ADDED
Binary file (6.04 kB). View file
 
app/services/rag_service.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file: rag_service.py
2
+
3
+ import os
4
+ import pandas as pd
5
+ from sqlalchemy import create_engine
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores.pgvector import PGVector # Replaces FAISS
8
+ from rank_bm25 import BM25Okapi
9
+ import google.generativeai as genai
10
+ from dotenv import load_dotenv
11
+
12
# --- CONFIGURATION ---
load_dotenv()
MODEL_NAME = "BAAI/bge-large-en-v1.5"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
DB_URL = os.getenv("DATABASE_URL")
COLLECTION_NAME = "professionals_embeddings_hybrid"  # Must match the ingestion script

# Fail early with a clear message instead of an opaque SQLAlchemy error.
if not DB_URL:
    raise RuntimeError("DATABASE_URL environment variable is not set.")

# --- INITIALIZATION ---
# Gemini client used to write the natural-language summary.
genai.configure(api_key=GEMINI_API_KEY)
llm = genai.GenerativeModel('gemini-1.5-pro')

# Embedding model used for semantic search; runs on CPU and emits
# normalized vectors.
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Load every professional once at import time and build the in-memory BM25
# keyword index over their text fields.
print("Initializing BM25 Keyword Search...")
engine = create_engine(DB_URL)
df = pd.read_sql('SELECT id, name, professional_role, skills, description FROM professionals_denormalized', engine)
# Replace NULLs column by column BEFORE concatenating. Adding a missing value
# to a string yields a missing value, so filling afterwards would blank out
# the entire document for any row that has a single empty field.
_text = df[['name', 'professional_role', 'skills', 'description']].fillna("")
corpus = (_text['name'] + ". " + _text['professional_role'] + ". " + _text['skills'] + ". " + _text['description']).tolist()
# split() with no argument splits on any whitespace run and never produces
# empty-string tokens.
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
print("BM25 Initialized.")
41
+
42
+ # --- HYBRID SEARCH LOGIC ---
43
def reciprocal_rank_fusion(ranked_lists, k=60):
    """
    Merge several ranked result lists with Reciprocal Rank Fusion (RRF).

    Each document contributes ``1 / (k + rank)`` for every list it appears
    in, where ``rank`` is its 1-based position in that list. Using 1-based
    ranks follows the standard RRF formula and keeps ``k=0`` valid (a
    0-based rank would divide by zero for the first item).

    Args:
        ranked_lists: Iterable of ranked lists; every item must expose
            ``metadata['professional_id']``.
        k: Smoothing constant that dampens the influence of top ranks.

    Returns:
        A list of ``(professional_id, fused_score)`` tuples sorted by score,
        highest first.
    """
    fused_scores = {}
    for doc_list in ranked_lists:
        for rank, doc in enumerate(doc_list, start=1):
            doc_id = doc.metadata['professional_id']
            fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1 / (k + rank)
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
54
+
55
# Lazily created, process-wide PGVector store (see _get_vector_store).
_vector_store = None


def _get_vector_store():
    """Return the shared PGVector store, creating it on first use."""
    global _vector_store
    if _vector_store is None:
        _vector_store = PGVector(
            connection_string=DB_URL,
            embedding_function=embeddings,
            collection_name=COLLECTION_NAME,
        )
    return _vector_store


def hybrid_search(query: str, k: int = 10):
    """
    Retrieve up to ``k`` professionals by fusing keyword and semantic search.

    Args:
        query: Free-text search query.
        k: Maximum number of documents to return; values <= 0 yield ``[]``.

    Returns:
        A list of document-like objects (each with ``page_content`` and
        ``metadata['professional_id']``) ordered by fused rank, best first.
        When an ID is found by both searches the semantic document is used.
    """
    from types import SimpleNamespace

    # A non-positive k would make the [:k] slices below cut from the end.
    if k <= 0:
        return []

    # 1. Keyword Search (BM25)
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    ranked_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)
    # Rows with no keyword overlap score 0; fusing them in would promote
    # arbitrary rows as if they were keyword hits, so they are dropped.
    # NOTE(review): assumes BM25 scores of real matches are positive, which
    # may not hold for a corpus of only a few rows - confirm.
    bm25_results = [
        SimpleNamespace(
            page_content=corpus[i],
            # Plain int keeps the metadata JSON-serialisable.
            metadata={"professional_id": int(df.iloc[i]['id'])},
        )
        for i in ranked_indices[:k]
        if bm25_scores[i] > 0
    ]

    # 2. Semantic Search against the live pgvector collection. The store is
    # reused across calls instead of being rebuilt on every request.
    semantic_results = _get_vector_store().similarity_search(query, k=k)

    # 3. Fuse the results
    fused_ranks = reciprocal_rank_fusion([semantic_results, bm25_results])

    # 4. Retrieve the final documents. setdefault keeps the first document
    # seen per ID, and semantic results are visited first, so they win ties.
    docs_by_id = {}
    for doc in (*semantic_results, *bm25_results):
        docs_by_id.setdefault(doc.metadata['professional_id'], doc)

    return [docs_by_id[doc_id] for doc_id, _score in fused_ranks[:k]]
87
+
88
+ # --- GENERATION LOGIC ---
89
def generate_summary_response(query: str, retrieved_docs: list):
    """
    Ask the LLM for a short summary of the retrieved profiles.

    Args:
        query: The user's original search query.
        retrieved_docs: Documents exposing ``page_content``.

    Returns:
        The generated summary text, or a fixed fallback sentence when there
        are no documents or the LLM call fails.
    """
    if not retrieved_docs:
        return "I could not find any professionals that match your query."

    # Profiles are separated by a horizontal-rule marker inside the prompt.
    profile_texts = [doc.page_content for doc in retrieved_docs]
    context = "\n\n---\n\n".join(profile_texts)
    prompt = f"""
    You are the Navigator AI assistant. Your task is to provide a helpful and concise summary based on the provided professional profiles.

    Based on the following retrieved profiles:
    ---
    {context}
    ---

    Please answer the user's original query: "{query}"

    Your response should be a brief, natural language summary that synthesizes the information and highlights why these professionals are a good match.
    """
    try:
        answer = llm.generate_content(prompt).text
    except Exception as e:
        print(f"🔥 Gemini API call failed: {e}")
        return "I was unable to generate a summary at this time."
    return answer
app/services/rag_service_old.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/services/rag_service.py
2
+ import os
3
+ import pandas as pd
4
+ from sqlalchemy import create_engine
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+ from rank_bm25 import BM25Okapi
8
+ import google.generativeai as genai
9
+ from dotenv import load_dotenv
10
+
11
# --- CONFIGURATION ---
load_dotenv()
VECTOR_STORE_PATH = "vector_store"
MODEL_NAME = "BAAI/bge-large-en-v1.5"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# --- INITIALIZATION ---
# Gemini client used to write the natural-language summary.
genai.configure(api_key=GEMINI_API_KEY)
llm = genai.GenerativeModel('gemini-2.5-pro')

# Embedding model used for semantic search; runs on CPU and emits
# normalized vectors.
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Load the vector store on startup.
# SECURITY NOTE(review): allow_dangerous_deserialization=True unpickles the
# index from disk; only load a vector_store directory that this project
# produced itself - never one from an untrusted source.
vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)

# Load data for BM25 Keyword Search
db_url = os.getenv("DATABASE_URL")
engine = create_engine(db_url)
df = pd.read_sql('SELECT id, name, professional_role, skills, description FROM professionals_denormalized', engine)
# Create the text corpus for BM25. NULLs are replaced column by column BEFORE
# concatenating: adding a missing value to a string yields a missing value,
# so filling afterwards would blank out the whole document for any row with
# a single empty field.
_text = df[['name', 'professional_role', 'skills', 'description']].fillna("")
corpus = (_text['name'] + ". " + _text['professional_role'] + ". " + _text['skills'] + ". " + _text['description']).tolist()
tokenized_corpus = [doc.lower().split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
42
+
43
+ # --- HYBRID SEARCH LOGIC ---
44
def reciprocal_rank_fusion(ranked_lists, k=60):
    """
    Combine ranked lists into one ranking by summing reciprocal-rank scores.

    Every item adds ``1 / (position + k)`` to its ID's total, where
    ``position`` is the item's zero-based index in its list.

    Returns:
        ``(professional_id, score)`` tuples ordered by descending score.
    """
    totals = {}
    for ranking in ranked_lists:
        for position, entry in enumerate(ranking):
            key = entry.metadata['professional_id']
            totals[key] = totals.get(key, 0) + 1 / (position + k)

    return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
55
+
56
def hybrid_search(query: str, k: int = 10):
    """
    Retrieve up to ``k`` professionals by fusing BM25 and FAISS results.

    Args:
        query: Free-text search query.
        k: Maximum number of documents to return; values <= 0 yield ``[]``.

    Returns:
        A list of document-like objects (each with ``page_content`` and
        ``metadata['professional_id']``) ordered by fused rank, best first.
        When an ID is found by both searches the FAISS document is used.
    """
    from types import SimpleNamespace

    # A non-positive k would make the [:k] slices below cut from the end.
    if k <= 0:
        return []

    # 1. Keyword Search (BM25)
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    ranked_indices = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)
    # Rows with no keyword overlap score 0; fusing them in would promote
    # arbitrary rows as if they were keyword hits, so they are dropped.
    # NOTE(review): assumes BM25 scores of real matches are positive, which
    # may not hold for a corpus of only a few rows - confirm.
    bm25_results = [
        SimpleNamespace(
            page_content=corpus[i],
            metadata={"professional_id": df.iloc[i]['id']},
        )
        for i in ranked_indices[:k]
        if bm25_scores[i] > 0
    ]

    # 2. Semantic Search (FAISS)
    semantic_results = vector_store.similarity_search(query, k=k)

    # 3. Fuse the results
    fused_ranks = reciprocal_rank_fusion([semantic_results, bm25_results])

    # 4. Retrieve the final documents. setdefault keeps the first document
    # seen per ID, and FAISS results are visited first, so they win ties.
    docs_by_id = {}
    for doc in (*semantic_results, *bm25_results):
        docs_by_id.setdefault(doc.metadata['professional_id'], doc)

    return [docs_by_id[doc_id] for doc_id, _score in fused_ranks[:k]]
85
+
86
+ # --- GENERATION LOGIC ---
87
def generate_summary_response(query: str, retrieved_docs: list):
    """
    Generates a natural language summary using the retrieved documents.

    Args:
        query: The user's original search query.
        retrieved_docs: Documents exposing ``page_content``.

    Returns:
        Always a string: the generated summary, or a fixed fallback sentence
        when there are no documents or the LLM call fails.
    """
    if not retrieved_docs:
        # Return a plain string here too. This path used to return a
        # (message, []) tuple, unlike every other path of the function.
        return "I could not find any professionals that match your query."

    # Prepare context from retrieved docs
    context = "\n\n---\n\n".join([doc.page_content for doc in retrieved_docs])

    prompt = f"""
    You are the Navigator AI assistant. Your task is to provide a helpful and concise summary based on the provided professional profiles.

    Based on the following retrieved profiles:
    ---
    {context}
    ---

    Please answer the user's original query: "{query}"

    Your response should be a brief, natural language summary that synthesizes the information and highlights why these professionals are a good match.
    """

    try:
        response = llm.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"🔥 Gemini API call failed: {e}")
        return "I was unable to generate a summary at this time."
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This line tells pip to use the main repository AND the PyTorch CPU repository
2
+ --extra-index-url https://download.pytorch.org/whl/cpu
3
+
4
+ # Core API Framework
5
+ fastapi==0.115.13
6
+ uvicorn==0.34.3
7
+ gunicorn==23.0.0
8
+
9
+ # Database & Vector Store
10
+ psycopg2-binary==2.9.10
11
+ SQLAlchemy==2.0.40
12
+ pgvector==0.4.1
13
+
14
+ # RAG & Machine Learning
15
+ langchain-huggingface==0.3.1
16
+ langchain-community==0.3.30
17
+ sentence-transformers==5.1.1
18
+ rank-bm25==0.2.2
19
+ google-generativeai==0.8.5
20
+ pandas==2.2.3
21
+ torch
22
+
23
+ # Utilities
24
+ python-dotenv==1.1.0