IsmatS committed on
Commit
ba54c7e
·
1 Parent(s): a9089e1
Files changed (1) hide show
  1. app/main.py +25 -58
app/main.py CHANGED
@@ -27,6 +27,7 @@ from pydantic import BaseModel
27
  from dotenv import load_dotenv
28
  from openai import AzureOpenAI
29
  from pinecone import Pinecone
 
30
 
31
  # Load environment variables
32
  load_dotenv()
@@ -88,6 +89,7 @@ templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
88
  # Initialize clients (lazy loading for faster startup)
89
  azure_client = None
90
  pinecone_index = None
 
91
 
92
 
93
  def get_azure_client():
@@ -111,66 +113,31 @@ def get_pinecone_index():
111
  return pinecone_index
112
 
113
 
 
 
 
 
 
 
 
 
 
 
114
  def get_embedding(text: str) -> List[float]:
115
  """
116
  Generate embedding for semantic search.
117
 
118
- Uses separate Azure OpenAI resource for embeddings (memory-efficient for Render free tier).
119
- Supports custom endpoint/key via AZURE_EMBEDDING_* environment variables.
120
  """
121
- # Check if using separate embedding resource
122
- embedding_endpoint = os.getenv("AZURE_EMBEDDING_ENDPOINT")
123
- embedding_api_key = os.getenv("AZURE_EMBEDDING_API_KEY")
124
-
125
- if embedding_endpoint and embedding_api_key:
126
- # Use separate embedding client
127
- embedding_client = AzureOpenAI(
128
- api_key=embedding_api_key,
129
- api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
130
- azure_endpoint=embedding_endpoint
131
- )
132
- else:
133
- # Fallback to main Azure client
134
- embedding_client = get_azure_client()
135
-
136
- # Get embedding model from env or use default
137
- embedding_model = os.getenv("AZURE_EMBEDDING_MODEL", "text-embedding-3-small")
138
- embedding_dims = int(os.getenv("AZURE_EMBEDDING_DIMS", "1024"))
139
-
140
  try:
141
- # Azure OpenAI doesn't support 'dimensions' parameter (different from OpenAI API)
142
- # Just create embedding and handle dimension mismatch afterward
143
- response = embedding_client.embeddings.create(
144
- input=text,
145
- model=embedding_model
146
- )
147
-
148
- embedding = response.data[0].embedding
149
-
150
- # text-embedding-3-small returns 1536 dims by default, but Pinecone expects 1024
151
- # Truncate or pad to match expected dimensions
152
- if len(embedding) != embedding_dims:
153
- if len(embedding) < embedding_dims:
154
- # Pad with zeros
155
- embedding = embedding + [0.0] * (embedding_dims - len(embedding))
156
- else:
157
- # Truncate to required dimensions
158
- embedding = embedding[:embedding_dims]
159
-
160
  return embedding
161
  except Exception as e:
162
- error_msg = str(e)
163
-
164
- # Provide helpful error message
165
- if "DeploymentNotFound" in error_msg or "404" in error_msg:
166
- print(f"❌ EMBEDDING ERROR: Deployment '{embedding_model}' not found")
167
- print(f" Endpoint: {embedding_endpoint or os.getenv('AZURE_OPENAI_ENDPOINT')}")
168
- print(f" Model: {embedding_model}")
169
- else:
170
- print(f"Embedding error: {e}")
171
-
172
  # Return zero vector (will not match documents, but API won't crash)
173
- return [0.0] * embedding_dims
174
 
175
 
176
  # Request/Response models
@@ -204,12 +171,12 @@ class AnswerResponse(BaseModel):
204
  response_time: float
205
 
206
 
207
- def retrieve_documents(query: str, top_k: int = 10) -> List[Dict]:
208
  """
209
  Retrieve relevant documents from Pinecone vector database.
210
- Increased to top-10 due to dimension truncation (1536→1024) affecting similarity scores.
211
 
212
- Uses Azure OpenAI embeddings (truncated to 1024-dim for Pinecone compatibility).
213
  """
214
  index = get_pinecone_index()
215
 
@@ -327,13 +294,13 @@ async def llm_endpoint(request: Request):
327
  LLM chatbot endpoint for SOCAR historical documents.
328
 
329
  Uses RAG (Retrieval Augmented Generation) with:
330
- - Embedding: Azure OpenAI text-embedding-3-small @ 1024-dim
331
  - Retrieval: Top-3 documents (Pinecone)
332
  - LLM: Llama-4-Maverick-17B (open-source)
333
  - Prompt: Citation-focused
334
 
335
  Expected performance:
336
- - Response time: ~4.0s
337
  - LLM Judge Score: 55.67%
338
  - Citation Score: 73.33%
339
 
@@ -421,8 +388,8 @@ async def llm_endpoint(request: Request):
421
  response_time=0.0
422
  )
423
 
424
- # Retrieve relevant documents (increased to 10 due to dimension truncation issues)
425
- documents = retrieve_documents(query, top_k=10)
426
 
427
  # Generate answer
428
  answer, response_time = generate_answer(
 
27
  from dotenv import load_dotenv
28
  from openai import AzureOpenAI
29
  from pinecone import Pinecone
30
+ from sentence_transformers import SentenceTransformer
31
 
32
  # Load environment variables
33
  load_dotenv()
 
89
  # Initialize clients (lazy loading for faster startup)
90
  azure_client = None
91
  pinecone_index = None
92
+ embedding_model = None
93
 
94
 
95
  def get_azure_client():
 
113
  return pinecone_index
114
 
115
 
116
def get_embedding_model():
    """Lazy load local embedding model (same as ingestion: BAAI/bge-large-en-v1.5)"""
    global embedding_model
    # Guard clause: reuse the already-loaded model on every call after the first.
    if embedding_model is not None:
        return embedding_model
    print("Loading BAAI/bge-large-en-v1.5 embedding model...")
    embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
    print("✅ Embedding model loaded")
    return embedding_model
125
+
126
def get_embedding(text: str) -> List[float]:
    """
    Generate embedding for semantic search.

    Uses BAAI/bge-large-en-v1.5 (same as document ingestion) for consistent embeddings.
    Returns 1024-dimensional vector matching Pinecone index.
    """
    try:
        # Encode with the lazily-loaded local model; .tolist() converts the
        # numpy array to a plain Python list for Pinecone / JSON serialization.
        return get_embedding_model().encode(text).tolist()
    except Exception as e:
        print(f"Embedding error: {e}")
        # Return zero vector (will not match documents, but API won't crash)
        return [0.0] * 1024
141
 
142
 
143
  # Request/Response models
 
171
  response_time: float
172
 
173
 
174
+ def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
175
  """
176
  Retrieve relevant documents from Pinecone vector database.
177
+ Best strategy from benchmark: vanilla top-3 with BAAI/bge-large-en-v1.5
178
 
179
+ Uses BAAI/bge-large-en-v1.5 embeddings (1024-dim, same as ingestion).
180
  """
181
  index = get_pinecone_index()
182
 
 
294
  LLM chatbot endpoint for SOCAR historical documents.
295
 
296
  Uses RAG (Retrieval Augmented Generation) with:
297
+ - Embedding: BAAI/bge-large-en-v1.5 @ 1024-dim (local model)
298
  - Retrieval: Top-3 documents (Pinecone)
299
  - LLM: Llama-4-Maverick-17B (open-source)
300
  - Prompt: Citation-focused
301
 
302
  Expected performance:
303
+ - Response time: ~3.6s
304
  - LLM Judge Score: 55.67%
305
  - Citation Score: 73.33%
306
 
 
388
  response_time=0.0
389
  )
390
 
391
+ # Retrieve relevant documents (top-3 is optimal per benchmarks)
392
+ documents = retrieve_documents(query, top_k=3)
393
 
394
  # Generate answer
395
  answer, response_time = generate_answer(