init

app/main.py (CHANGED, +25 -58)
@@ -27,6 +27,7 @@ from pydantic import BaseModel
 from dotenv import load_dotenv
 from openai import AzureOpenAI
 from pinecone import Pinecone
+from sentence_transformers import SentenceTransformer
 
 # Load environment variables
 load_dotenv()
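Note: the new import implies an added runtime dependency; presumably the sentence-transformers package (and its torch backend) was added to the project's requirements, e.g. `pip install sentence-transformers`.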
@@ -88,6 +89,7 @@ templates = Jinja2Templates(directory=str(TEMPLATES_DIR))
 # Initialize clients (lazy loading for faster startup)
 azure_client = None
 pinecone_index = None
+embedding_model = None
 
 
 def get_azure_client():
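The body of get_azure_client() sits outside this diff's context. For reference, a minimal sketch of the lazy-singleton pattern these module globals imply, mirroring the get_embedding_model() added below; AZURE_OPENAI_API_KEY is an assumed env-var name, the other two appear in the removed code:

import os
from openai import AzureOpenAI

azure_client = None

def get_azure_client():
    # Construct the client on first call, then reuse the module-level instance
    global azure_client
    if azure_client is None:
        azure_client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),  # assumed env-var name
            api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        )
    return azure_client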
@@ -111,66 +113,31 @@ def get_pinecone_index():
     return pinecone_index
 
 
+def get_embedding_model():
+    """Lazy load local embedding model (same as ingestion: BAAI/bge-large-en-v1.5)"""
+    global embedding_model
+    if embedding_model is None:
+        print("Loading BAAI/bge-large-en-v1.5 embedding model...")
+        embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
+        print("✅ Embedding model loaded")
+    return embedding_model
+
+
 def get_embedding(text: str) -> List[float]:
     """
     Generate embedding for semantic search.
 
-    Uses
-
+    Uses BAAI/bge-large-en-v1.5 (same as document ingestion) for consistent embeddings.
+    Returns 1024-dimensional vector matching Pinecone index.
     """
-    # Check if using separate embedding resource
-    embedding_endpoint = os.getenv("AZURE_EMBEDDING_ENDPOINT")
-    embedding_api_key = os.getenv("AZURE_EMBEDDING_API_KEY")
-
-    if embedding_endpoint and embedding_api_key:
-        # Use separate embedding client
-        embedding_client = AzureOpenAI(
-            api_key=embedding_api_key,
-            api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
-            azure_endpoint=embedding_endpoint
-        )
-    else:
-        # Fallback to main Azure client
-        embedding_client = get_azure_client()
-
-    # Get embedding model from env or use default
-    embedding_model = os.getenv("AZURE_EMBEDDING_MODEL", "text-embedding-3-small")
-    embedding_dims = int(os.getenv("AZURE_EMBEDDING_DIMS", "1024"))
-
     try:
-
-
-        response = embedding_client.embeddings.create(
-            input=text,
-            model=embedding_model
-        )
-
-        embedding = response.data[0].embedding
-
-        # text-embedding-3-small returns 1536 dims by default, but Pinecone expects 1024
-        # Truncate or pad to match expected dimensions
-        if len(embedding) != embedding_dims:
-            if len(embedding) < embedding_dims:
-                # Pad with zeros
-                embedding = embedding + [0.0] * (embedding_dims - len(embedding))
-            else:
-                # Truncate to required dimensions
-                embedding = embedding[:embedding_dims]
-
+        model = get_embedding_model()
+        embedding = model.encode(text).tolist()
         return embedding
     except Exception as e:
-
-        error_msg = str(e)
-        # Provide helpful error message
-        if "DeploymentNotFound" in error_msg or "404" in error_msg:
-            print(f"❌ EMBEDDING ERROR: Deployment '{embedding_model}' not found")
-            print(f"   Endpoint: {embedding_endpoint or os.getenv('AZURE_OPENAI_ENDPOINT')}")
-            print(f"   Model: {embedding_model}")
-        else:
-            print(f"Embedding error: {e}")
-
+        print(f"Embedding error: {e}")
         # Return zero vector (will not match documents, but API won't crash)
-        return [0.0] * embedding_dims
+        return [0.0] * 1024
 
 
 # Request/Response models
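A quick standalone check (not part of the commit) of why the old pad/truncate logic could be deleted outright: BAAI/bge-large-en-v1.5 natively emits 1024-dimensional vectors, matching the Pinecone index the comments describe:

from sentence_transformers import SentenceTransformer

# Same model get_embedding_model() loads lazily
model = SentenceTransformer("BAAI/bge-large-en-v1.5")
vector = model.encode("When was SOCAR founded?").tolist()

# 1024 dims out of the box, so no pad/truncate step is needed anymore
assert len(vector) == 1024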
@@ -204,12 +171,12 @@ class AnswerResponse(BaseModel):
     response_time: float
 
 
-def retrieve_documents(query: str, top_k: int =
+def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
     """
     Retrieve relevant documents from Pinecone vector database.
-
+    Best strategy from benchmark: vanilla top-3 with BAAI/bge-large-en-v1.5
 
-    Uses
+    Uses BAAI/bge-large-en-v1.5 embeddings (1024-dim, same as ingestion).
     """
     index = get_pinecone_index()
 
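The query logic after get_pinecone_index() falls outside this hunk. A hedged sketch of the top-k similarity search the docstring describes, reusing the module's own helpers; storing document text in metadata is an assumption:

def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
    index = get_pinecone_index()
    vector = get_embedding(query)  # 1024-dim vector from the local bge model
    # Standard Pinecone similarity search over the ingested documents
    results = index.query(vector=vector, top_k=top_k, include_metadata=True)
    return [match.metadata for match in results.matches]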
@@ -327,13 +294,13 @@ async def llm_endpoint(request: Request):
     LLM chatbot endpoint for SOCAR historical documents.
 
     Uses RAG (Retrieval Augmented Generation) with:
-    - Embedding:
+    - Embedding: BAAI/bge-large-en-v1.5 @ 1024-dim (local model)
     - Retrieval: Top-3 documents (Pinecone)
     - LLM: Llama-4-Maverick-17B (open-source)
     - Prompt: Citation-focused
 
     Expected performance:
-    - Response time: ~
+    - Response time: ~3.6s
     - LLM Judge Score: 55.67%
     - Citation Score: 73.33%
 
@@ -421,8 +388,8 @@ async def llm_endpoint(request: Request):
             response_time=0.0
         )
 
-    # Retrieve relevant documents (
-    documents = retrieve_documents(query, top_k=
+    # Retrieve relevant documents (top-3 is optimal per benchmarks)
+    documents = retrieve_documents(query, top_k=3)
 
     # Generate answer
     answer, response_time = generate_answer(
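For orientation, a hypothetical end-to-end call matching this flow; generate_answer's parameter list is cut off above, so the keyword names here are assumptions:

query = "When was SOCAR founded?"
documents = retrieve_documents(query, top_k=3)  # Pinecone top-3
answer, response_time = generate_answer(query=query, documents=documents)  # assumed signature
print(f"{answer} ({response_time:.1f}s)")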
|