CarsRUS / rag_engine.py
galbendavids's picture
CarsRUS: session car history – stick to discussed models on follow-up
cdf8db8
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import os
import re
import requests
from collections import defaultdict
from typing import List, Dict, Tuple, Optional
import time
import hashlib
import random
import threading
import logging
# Pipeline logging: visible in HF Spaces logs and helps debug the RAG flow.
PIPELINE_LOG = logging.getLogger("pipeline")
# Attach a stream handler only when the logger has none yet, so that importing
# this module more than once does not duplicate every log line.
# NOTE(review): indentation was lost in this copy of the file; all four
# statements below are assumed to form the body of the `if` - confirm against
# the original (in particular whether setLevel belongs inside it).
if not PIPELINE_LOG.handlers:
    h = logging.StreamHandler()
    h.setFormatter(logging.Formatter("[PIPELINE] %(message)s"))
    PIPELINE_LOG.addHandler(h)
    PIPELINE_LOG.setLevel(logging.INFO)
class RAGEngine:
def __init__(self, data_path=None):
    """Configure the engine, load the scraped articles and prepare chunk embeddings.

    Args:
        data_path: Path to the scraped-articles JSON file. When None, the file
            ``data_ingestion/scraped_data.json`` next to this module is used.
    """
    print("Initializing RAG Engine with Advanced Features...")
    if data_path is not None:
        self.data_path = data_path
    else:
        module_dir = os.path.dirname(__file__)
        self.data_path = os.path.join(module_dir, "data_ingestion", "scraped_data.json")
    print(f"Using data path: {self.data_path}")

    # The sentence encoder is created by _get_encoder() the first time it is needed.
    self.encoder = None
    self._encoder_model_name = 'paraphrase-multilingual-MiniLM-L12-v2'

    # Retrieval state.
    self.chunks = []
    self.chunk_metadata = []  # one metadata dict per chunk, same index as self.chunks
    self.embeddings = None
    self.keyword_index = {}  # keyword -> list of chunk indices
    self.car_normalization = self._build_car_normalization()  # name variant -> canonical id
    # Regex patterns for robust car-name detection.
    self._build_regex_patterns()
    self.conversation_history = []

    # Rate limiting and caching.
    self.response_cache = {}  # cache for identical queries
    self.last_request_time = 0  # timestamp of the last API request
    # Minimum delay between requests (seconds), tunable via env.
    self.request_delay = float(os.environ.get("GEMINI_REQUEST_DELAY", "0.5"))
    # Lock plus shared cooldown so concurrent requests throttle together.
    self._rate_limit_lock = threading.Lock()
    self._rate_limited_until = 0.0

    # Speed/size knobs (reduce prompt size and generation time).
    read_env = os.environ.get
    self.max_chunks_general = int(read_env("RAG_TOP_K_GENERAL", "4"))
    self.max_chunks_comparison = int(read_env("RAG_TOP_K_COMPARISON", "6"))
    self.max_context_chars_per_chunk = int(read_env("RAG_CONTEXT_CHARS_PER_CHUNK", "280"))
    self.max_output_tokens = int(read_env("GEMINI_MAX_OUTPUT_TOKENS", "600"))
    self._api_timeout_seconds = int(read_env("GEMINI_API_TIMEOUT", "45"))
    self._generation_config = dict(
        max_output_tokens=self.max_output_tokens,
        temperature=0.4,
    )

    self._load_and_process_data()
    # Loads saved chunk vectors when they match, otherwise encodes and saves them.
    self._build_index()
    print("RAG Engine Initialized (encoder + all embeddings ready at startup).")
def _build_car_normalization(self) -> Dict[str, str]:
"""ืขืฆื” 4: ืžื™ืœื•ืŸ ื ืจืžื•ืœ ืฉืžื•ืช ืจื›ื‘ื™ื (ืขื‘ืจื™ืช + ืื ื’ืœื™ืช)"""
return {
# ื˜ื•ื™ื•ื˜ื” ืงื•ืจื•ืœื”
'ืงื•ืจื•ืœื”': 'toyota_corolla',
'toyota corolla': 'toyota_corolla',
'ื˜ื•ื™ื•ื˜ื” ืงื•ืจื•ืœื”': 'toyota_corolla',
'corolla': 'toyota_corolla',
# ืกื™ื˜ืจื•ืืŸ C3
'c3': 'citroen_c3',
'citroen c3': 'citroen_c3',
'ืกื™ื˜ืจื•ืืŸ c3': 'citroen_c3',
'c3 ื”ื—ื“ืฉื”': 'citroen_c3',
# ืื•ื“ื™ RS3
'rs3': 'audi_rs3',
'audi rs3': 'audi_rs3',
'ืื•ื“ื™ rs3': 'audi_rs3',
# ืงื™ื” EV9
'ev9': 'kia_ev9',
'kia ev9': 'kia_ev9',
'ืงื™ื” ev9': 'kia_ev9',
# MG S6
's6': 'mg_s6',
'mg s6': 'mg_s6',
'mg-s6': 'mg_s6',
# ื™ื•ื ื“ืื™ ืืœื ื˜ืจื” N
'elantra n': 'hyundai_elantra_n',
'ืืœื ื˜ืจื” n': 'hyundai_elantra_n',
'elantra': 'hyundai_elantra_n',
# ืื™ื•ืŸ HT
'aion ht': 'aion_ht',
'ht': 'aion_ht',
'ืื™ื•ืŸ ht': 'aion_ht',
# ื’'ื ืกื™ืก GV80
'genesis gv80': 'genesis_gv80',
'gv80': 'genesis_gv80',
"ื’'ื ืกื™ืก gv80": 'genesis_gv80',
# Link & Co 01 (support "and", "&", Hebrew ืœื™ื ืง ืื ื“/& ืงื•)
'link & co 01': 'link_co_01',
'link co 01': 'link_co_01',
'link and co 01': 'link_co_01',
'link and co': 'link_co_01',
"ืœื™ื ืง ืื ื“ ืงื• 01": 'link_co_01',
"ืœื™ื ืง ืื ื“ ืงื•": 'link_co_01',
"ืœื™ื ืง & ืงื• 01": 'link_co_01',
"ืœื™ื ืง & ืงื•": 'link_co_01',
}
def _chunk_by_topic(self, text: str, title: str, url: str) -> List[Dict]:
"""ืขืฆื” 1: ื—ืœื•ืงืช ืžื™ื“ืข ืœื—ืชื™ื›ื•ืช ืœืคื™ ื ื•ืฉืื™ื"""
chunks_list = []
# ื ื•ืฉืื™ื ืขื™ืงืจื™ื™ื ืœื—ืชื•ืš
topics = {
'ืžืคืจื˜ ื˜ื›ื ื™|ื‘ื™ืฆื•ืขื™ื|ืžื ื•ืข|ืชื™ื‘ื”|ื”ื ืขื”': 'technical_specs',
'ื‘ื˜ื™ื—ื•ืช|ืžืขืจื›ื•ืช ื‘ื˜ื™ื—ื•ืช|ื‘ืœืžื™ื': 'safety',
'ืžื—ื™ืจ|ืžื—ื™ืจื”': 'price',
'ืขื™ืฆื•ื‘|ืžืจืื”|ื—ื™ืฆื•ื ื™': 'design',
'ื ื•ื—ื•ืช|ื”ืชื ื”ืœื•ืช|ืžืฉื‘ื•ืฉื™ื': 'comfort',
'ืฆืจื™ื›ื”|ื˜ื•ื•ื—|ื˜ืขื™ื ื”': 'efficiency',
'ื“ื™ื ืžื™ืงื”|ื”ื’ื”|ื‘ื™ืฆื•ืขื™ื ื“ื™ื ืžื™ื™ื': 'dynamic',
}
# ื—ืœื•ืงื” ื‘ืกื™ืกื™ืช ืœืคืกืงืื•ืช
paragraphs = text.split('\n')
current_chunk = []
current_topic = 'general'
for para in paragraphs:
if len(para.strip()) < 20:
continue
# ื–ื™ื”ื•ื™ ื ื•ืฉื
for pattern, topic in topics.items():
if re.search(pattern, para, re.IGNORECASE):
if current_chunk and current_topic != topic:
chunk_text = '\n'.join(current_chunk)
if len(chunk_text) > 50:
chunks_list.append({
'text': chunk_text,
'topic': current_topic,
'title': title,
'url': url
})
current_chunk = []
current_topic = topic
break
current_chunk.append(para)
# ื”ื•ืกืคืช ื”ื ืชื•ืŸ ื”ืื—ืจื•ืŸ
if current_chunk:
chunk_text = '\n'.join(current_chunk)
if len(chunk_text) > 50:
chunks_list.append({
'text': chunk_text,
'topic': current_topic,
'title': title,
'url': url
})
return chunks_list
def _normalize_car_name(self, text: str) -> str:
"""ืขืฆื” 4: ื ืจืžื•ืœ ืฉืžื•ืช ืจื›ื‘ื™ื ื‘ื˜ืงืกื˜
Returns canonical id (e.g. 'audi_rs3') if matched, else returns None.
Uses regex_patterns first, then falls back to simple variation map.
"""
if not text:
return None
txt = text.lower()
# Try regex patterns (robust, multilingual, handles spaces/hyphens)
for pattern, canonical in getattr(self, 'regex_patterns', {}).items():
try:
if re.search(pattern, txt):
return canonical
except re.error:
# Skip invalid patterns (shouldn't happen)
continue
# Fallback: match known variants as whole words
for variant, canonical in self.car_normalization.items():
if re.search(rf"\b{re.escape(variant.lower())}\b", txt):
return canonical
return None
def _build_regex_patterns(self):
"""ื‘ื ื” ืชื‘ื ื™ื•ืช ืจื’ืงืก ื—ื›ืžื•ืช ืœื–ื™ื”ื•ื™ ืฉืžื•ืช ืจื›ื‘ื™ื (ื›ื•ืœืœ ืขื‘ืจื™ืช โ€“ ื‘ืœื™ \\b ืขืœ ืขื‘ืจื™ืช)"""
# Patterns: English use \\b; Hebrew has no word boundary (ืืœื ื˜ืจื” can appear as "ืœืืœื ื˜ืจื”")
self.regex_patterns = {
r'\baudi[\s\-]*rs\s*3\b': 'audi_rs3',
r'\bcitroen[\s\-]*c\s*3\b': 'citroen_c3',
r'\bc\s*3\b': 'citroen_c3',
r'\bkia[\s\-]*ev\s*9\b': 'kia_ev9',
r'\bev\s*9\b': 'kia_ev9',
r'\bhyundai[\s\-]*elantra\s*n\b': 'hyundai_elantra_n',
r'\belantra\s*n\b': 'hyundai_elantra_n',
r'ืืœื ื˜ืจื”\s*[nN]?': 'hyundai_elantra_n', # Hebrew: "ืืœื ื˜ืจื”" or "ืืœื ื˜ืจื” N" (no \\b โ€“ "ืœืืœื ื˜ืจื”" ok)
r'\baion\s*ht\b': 'aion_ht',
r'\bgenesis[\s\-]*gv\s*80\b': 'genesis_gv80',
r'\bgv\s*80\b': 'genesis_gv80',
r'\blink\s*(?:&|and)\s*co\.?\s*01\b': 'link_co_01',
r'\blink\s*&?\s*co\s*01\b': 'link_co_01',
r'ืœื™ื ืง\s*(?:&|ืื ื“)\s*ืงื•\s*01?': 'link_co_01', # Hebrew: ืœื™ื ืง ืื ื“/& ืงื• 01
r'\brs\s*3\b': 'audi_rs3',
r'\bcorolla\b': 'toyota_corolla',
}
def _extract_keywords(self, text: str) -> List[str]:
"""ืขืฆื” 3: ื—ื™ืœื•ืฅ ืžื™ืœื•ืช ืžืคืชื—"""
# ื—ื™ืœื•ืฅ ืžื™ืœื•ืช ืขื ืžืฉืžืขื•ืช (ื‘ืขื‘ืจื™ืช ื•ืื ื’ืœื™ืช)
keywords = []
# ื“ื•ื’ืžืื•ืช ืฉืœ ืžื™ืœื•ืช ืžืคืชื— ื—ืฉื•ื‘ื•ืช
important_words = [
r'\b\d+\s*ื›"ืก\b', # ืงื•ื—
r'\b\d+\s*ืงืž"ืฉ\b', # ืงืž"ืฉ
r'\b\d+\.?\d*\s*ืฉื ื™ื•ืช?\b', # ื–ืžืŸ ื”ืืฆื”
r'\b\d+\s*ืœื™ื˜ืจ\b', # ื ืคื—
r'ื˜ื•ืจื‘ื•|ื”ื™ื‘ืจื™ื“ื™|ื—ืฉืžืœื™|ื“ื•-ืžืฆืžื“ื™ืช|ื™ื“ื ื™ืช', # ืกื•ื’ื™ ื”ื ืขื”
r'ืžื ื•ืข|ื‘ื˜ื™ื—ื•ืช|ื ื•ื—ื•ืช|ืขื™ืฆื•ื‘|ืžื—ื™ืจ', # ืงื˜ื’ื•ืจื™ื•ืช
]
for pattern in important_words:
matches = re.findall(pattern, text, re.IGNORECASE)
keywords.extend(matches)
return list(set(keywords))
def _load_and_process_data(self):
"""ืขืฆื•ืช 1+2: ื˜ืขื™ื ื” ื•ื—ืœื•ืงื” ื—ื›ืžื” ืขื ืžื˜ื-ื“ืื˜ื”"""
with open(self.data_path, 'r', encoding='utf-8') as f:
raw_data = json.load(f)
self.chunks = []
self.chunk_metadata = []
for article in raw_data:
text = article['content']
url = article['url']
title = article['title']
# ื ืจืžื•ืœ ืฉืžื•ืช ืจื›ื‘ื™ื ื‘ื˜ืงืกื˜ (ืฉืžืจื• ืืช ื”ืชื•ืฆืื” ื‘ื ืคืจื“)
normalized_car = self._normalize_car_name(text)
# ื—ืœื•ืงื” ื—ื›ืžื” ืœืคื™ ื ื•ืฉืื™ื - ื”ืฉืชืžืฉื• ื‘ื˜ืงืกื˜ ื”ืžืœื (ืœื ื‘ืขืจืš ื”ืžื ื•ืจืžืœ)
topic_chunks = self._chunk_by_topic(text, title, url)
for chunk_data in topic_chunks:
chunk_text = chunk_data['text']
# ืขืฆื” 2: ืžื˜ื-ื“ืื˜ื” ืขืฉื™ืจื” ืœื›ืœ ื—ืชื™ื›ื”
keywords = self._extract_keywords(chunk_text)
metadata = {
"title": chunk_data['title'],
"url": chunk_data['url'],
"topic": chunk_data['topic'],
"keywords": keywords,
"publish_date": "2024-2025", # ืžื™ื“ืข ืคืจืกื•ื
"car_type": self._extract_car_type(chunk_data['title']),
"length": len(chunk_text)
}
self.chunks.append(chunk_text)
self.chunk_metadata.append(metadata)
# ืขืฆื” 3: ื‘ื ื™ื™ื” ืฉืœ ืื™ื ื“ืงืก ืžื™ืœื•ืช ืžืคืชื—
for keyword in keywords:
if keyword not in self.keyword_index:
self.keyword_index[keyword] = []
self.keyword_index[keyword].append(len(self.chunks) - 1)
print(f"Created {len(self.chunks)} smart chunks from {len(raw_data)} articles with rich metadata.")
def _extract_car_type(self, title: str) -> str:
"""ื–ื™ื”ื•ื™ ืกื•ื’ ื”ืจื›ื‘"""
types_map = {
'C3': 'supermini',
'RS3': 'compact',
'EV9': 'suv',
'S6': 'suv',
'ืืœื ื˜ืจื”': 'sedan',
'Elantra': 'sedan',
'HT': 'suv',
'ืœื™ื ืง': 'compact', # Link & Co 01
'01': 'compact', # Link & Co 01 (title contains "01")
}
for key, type_val in types_map.items():
if key in title:
return type_val
return 'unknown'
def _get_encoder(self):
"""Load encoder once (called at startup before _build_index)."""
if self.encoder is None:
print("Loading embedding model...")
self.encoder = SentenceTransformer(self._encoder_model_name)
print("Embedding model loaded.")
return self.encoder
def _embeddings_path(self) -> Tuple[str, str]:
"""Path to saved embeddings and meta (same dir as scraped_data.json)."""
data_dir = os.path.dirname(self.data_path)
return (
os.path.join(data_dir, "chunk_embeddings.npy"),
os.path.join(data_dir, "chunk_embeddings_meta.json"),
)
def _build_index(self):
"""Build or load chunk vectors. Saves to disk so next startup loads from file (no encoder over chunks)."""
if self.embeddings is not None:
return
emb_path, meta_path = self._embeddings_path()
n_chunks = len(self.chunks)
# Try load existing vectors (only if chunk count matches)
if os.path.isfile(emb_path) and os.path.isfile(meta_path):
try:
with open(meta_path, "r", encoding="utf-8") as f:
meta = json.load(f)
if meta.get("n_chunks") == n_chunks and meta.get("model") == self._encoder_model_name:
self.embeddings = np.load(emb_path)
if self.embeddings.shape[0] == n_chunks:
print(f"Loaded {n_chunks} embeddings from {emb_path}")
return
except Exception as e:
print(f"Could not load saved embeddings: {e}. Rebuilding...")
# Build and save
print("Building chunk embeddings...")
encoder = self._get_encoder()
self.embeddings = encoder.encode(self.chunks, batch_size=32)
norm = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
self.embeddings = self.embeddings / norm
os.makedirs(os.path.dirname(emb_path) or ".", exist_ok=True)
np.save(emb_path, self.embeddings)
with open(meta_path, "w", encoding="utf-8") as f:
json.dump({"n_chunks": n_chunks, "model": self._encoder_model_name}, f, indent=0)
print(f"Saved {n_chunks} embeddings to {emb_path}")
def _hybrid_search(self, query: str, top_k: int = 5) -> List[Dict]:
"""Hybrid search: prebuilt chunk vectors + keyword index. Only the query is embedded at runtime."""
# Embeddings are built at startup; this is a no-op if already built.
self._build_index()
# ื ืจืžื•ืœ ื”ืฉืื™ืœืชื”
normalized_query = self._normalize_car_name(query)
# ืื ื”ื ืจืžื•ืœ ืœื ืžืฆื canonical id, ื”ืฉืชืžืฉ ื‘ืฉืื™ืœืชื” ื”ืžืงื•ืจื™ืช
if normalized_query is None:
normalized_query = query
# ื—ื™ืคื•ืฉ ื•ืงื˜ื•ืจื™
# Ensure we pass a string to the encoder
query_text_for_embedding = normalized_query if isinstance(normalized_query, str) else str(normalized_query)
encoder = self._get_encoder()
query_embedding = encoder.encode([query_text_for_embedding])
query_embedding = query_embedding / np.linalg.norm(query_embedding)
scores = np.dot(self.embeddings, query_embedding.T).flatten()
# ื—ื™ืคื•ืฉ ืžื™ืœื•ืช ืžืคืชื—
keywords = self._extract_keywords(normalized_query)
keyword_matches = set()
for keyword in keywords:
if keyword in self.keyword_index:
keyword_matches.update(self.keyword_index[keyword])
# ืฉื™ืœื•ื‘ ื”ืชื•ืฆืื•ืช
combined_scores = scores.copy()
for idx in keyword_matches:
combined_scores[idx] += 0.3 # ื‘ื•ื ื•ืก ืœืžืฉื—ืงื™ ืžื™ืœื•ืช ืžืคืชื—
# ื‘ื—ื™ืจืช Top K
top_indices = np.argsort(combined_scores)[-top_k:][::-1]
results = []
for idx in top_indices:
results.append({
"text": self.chunks[idx],
"metadata": self.chunk_metadata[idx],
"score": float(combined_scores[idx])
})
return results
def retrieve(self, query: str, top_k: int = 5):
"""ืขืฆื” 3: ื—ื™ืคื•ืฉ ื”ื™ื‘ืจื™ื“ื™ ื‘ืžืงื•ื ืจืง ื•ืงื˜ื•ืจื™"""
return self._hybrid_search(query, top_k)
def _extract_comparison_data(self, car1: str, car2: str) -> Dict:
"""ืขืฆื” 5: ื—ื™ืœื•ืฅ ื ืชื•ื ื™ื ืžื•ื‘ื ื™ื ืœื”ืฉื•ื•ืื”"""
specs_map = {
'power': r'(\d+)\s*ื›"ืก',
'torque': r'(\d+\.?\d*)\s*ืงื’"ืž',
'acceleration': r'(\d+\.?\d*)\s*ืฉื ื™ื•ืช?\s*ืœ-?100',
'top_speed': r'(\d+)\s*ืงืž"ืฉ',
'consumption': r'(\d+\.?\d*)\s*ืง"ืž/l',
}
# ื—ื™ืคื•ืฉ ื ืชื•ื ื™ื ืขื‘ื•ืจ ื›ืœ ืจื›ื‘
car1_data = {}
car2_data = {}
results1 = self._hybrid_search(car1, top_k=10)
results2 = self._hybrid_search(car2, top_k=10)
for result in results1:
for spec, pattern in specs_map.items():
match = re.search(pattern, result['text'])
if match and spec not in car1_data:
car1_data[spec] = match.group(1)
for result in results2:
for spec, pattern in specs_map.items():
match = re.search(pattern, result['text'])
if match and spec not in car2_data:
car2_data[spec] = match.group(1)
return {
"car1": {"name": car1, "specs": car1_data},
"car2": {"name": car2, "specs": car2_data}
}
def _is_comparison_question(self, query: str) -> bool:
"""Rule-based only (regex/keywords). No LLM. Detects comparison vs single-model questions."""
if not query:
return False
q = query.lower()
comparison_keywords = [
'ืžื” ื™ื•ืชืจ ื˜ื•ื‘', 'ื”ืฉื•ื•ืื”', 'ืœืขื•ืžืช', 'vs', 'versus', 'compare', 'better than',
'ื‘ื™ืŸ ', ' ื”ืฉื•ื•ืื”', 'ืœื”ืฉื•ื•ืช', 'compare between',
]
if any(k in q for k in comparison_keywords):
return True
# "X vs Y" or "X ื•-Y" / "X and Y" with two model-like tokens
if re.search(r'\bvs\b|\bversus\b| ื• | and | versus ', q, re.IGNORECASE):
return True
return False
def _maintain_conversation_history(self, query: str, response: str, max_turns: int = 5):
"""ืขืฆื” 10: ื ื™ื”ื•ืœ ื”ื™ืกื˜ื•ืจื™ื™ืช ืฉื™ื—ื” ื—ื›ืžื”"""
self.conversation_history.append({
"query": query,
"response": response
})
# ืฉืžื™ืจืช ืจืง 5 ืชื•ืจื•ืช ืื—ืจื•ื ื•ืช
if len(self.conversation_history) > max_turns:
self.conversation_history = self.conversation_history[-max_turns:]
def _get_context_from_history(self) -> str:
"""ื—ื™ืœื•ืฅ ื”ืงืฉืจ ืžื”ื™ืกื˜ื•ืจื™ื™ืช ื”ืฉื™ื—ื” โ€“ Q ื•-A ื›ื“ื™ ืฉื”ืžื•ื“ืœ ื™ื”ื™ื” ืžื•ื“ืข ืœืฉื™ื—ื” ื‘ื”ืžืฉืš"""
if not self.conversation_history:
return ""
context_lines = []
for turn in self.conversation_history[-3:]: # 3 ืชื•ืจื•ืช ืื—ืจื•ื ื•ืช
q = (turn.get("query") or "")[:200]
a = (turn.get("response") or "")[:300]
context_lines.append(f"Q: {q}\nA: {a}")
return "\n\n".join(context_lines)
def _get_mentioned_cars_in_conversation(self, max_turns: int = 5) -> set:
"""ื“ื’ืžื™ื ืฉืžื•ืคื™ืขื™ื ื‘ื”ื™ืกื˜ื•ืจื™ื™ืช ื”ืฉื™ื—ื” ื”ื ื•ื›ื—ื™ืช (ืฉืืœื•ืช + ืชืฉื•ื‘ื•ืช) โ€“ ื›ื“ื™ ืœื“ื‘ื•ืง ื‘ื”ื ื‘ึพfollow-up."""
mentioned = set()
if not self.conversation_history:
return mentioned
for turn in self.conversation_history[-max_turns:]:
for key in ("query", "response"):
text = (turn.get(key) or "")[:1500]
mentioned.update(self._find_supported_canonicals_in_text(text))
return mentioned
def _get_cache_key(self, query: str) -> str:
"""Generate cache key for query"""
return hashlib.md5(query.lower().encode()).hexdigest()
@staticmethod
def _extract_retry_after_seconds(error_text: str) -> Optional[int]:
"""Best-effort parse of Retry-After seconds from an error message."""
if not error_text:
return None
# Common patterns: "Retry-After: 60", "retry_after: 60", "Retry after 60s"
m = re.search(r"retry[-_\s]*after[:\s]+(\d+)", error_text, re.IGNORECASE)
if m:
try:
return int(m.group(1))
except Exception:
return None
m = re.search(r"retry\s+after\s+(\d+)\s*s", error_text, re.IGNORECASE)
if m:
try:
return int(m.group(1))
except Exception:
return None
return None
@staticmethod
def _is_hebrew(text: str) -> bool:
return bool(re.search(r"[\u0590-\u05FF]", text or ""))
# Canonical id -> display name for supported models only (allowed to recommend/compare).
# The key set doubles as the allow-list used by the "supported canonicals" finders.
# 'toyota_corolla' is produced by the name normalization but is not listed here,
# so those finders filter it out.
CANONICAL_TO_DISPLAY = {
    "citroen_c3": "Citroen C3",
    "audi_rs3": "Audi RS3",
    "kia_ev9": "Kia EV9",
    "mg_s6": "MG S6",
    "hyundai_elantra_n": "Hyundai Elantra N",
    "aion_ht": "Aion HT",
    "genesis_gv80": "Genesis GV80",
    "link_co_01": "Link & Co 01",
}
@classmethod
def _supported_cars_display(cls) -> List[str]:
"""
The only car models this app is allowed to recommend/compare.
Must correspond to articles that exist in `data_ingestion/scraped_data.json`.
"""
return list(cls.CANONICAL_TO_DISPLAY.values())
def _find_supported_canonicals_in_text(self, text: str) -> set:
"""Find all supported canonical car ids mentioned in the text (only canonicals we allow)."""
found = set()
if not text:
return found
txt = text.lower()
allowed = set(self.CANONICAL_TO_DISPLAY.keys())
# Regex patterns
for pattern, canonical in getattr(self, "regex_patterns", {}).items():
try:
if canonical in allowed and re.search(pattern, txt):
found.add(canonical)
except re.error:
continue
# Variant map
for variant, canonical in self.car_normalization.items():
if canonical in allowed and re.search(rf"\b{re.escape(variant.lower())}\b", txt):
found.add(canonical)
return found
def _get_ordered_supported_canonicals_in_text(self, text: str) -> List[str]:
"""Return supported canonicals mentioned in text, in order of first appearance."""
if not text:
return []
txt = text.lower()
allowed = set(self.CANONICAL_TO_DISPLAY.keys())
# canonical -> earliest start position
positions: Dict[str, int] = {}
for pattern, canonical in getattr(self, "regex_patterns", {}).items():
if canonical not in allowed:
continue
try:
m = re.search(pattern, txt)
if m:
pos = m.start()
if canonical not in positions or pos < positions[canonical]:
positions[canonical] = pos
except re.error:
continue
for variant, canonical in self.car_normalization.items():
if canonical not in allowed:
continue
m = re.search(rf"\b{re.escape(variant.lower())}\b", txt)
if m:
pos = m.start()
if canonical not in positions or pos < positions[canonical]:
positions[canonical] = pos
return [c for c in sorted(positions.keys(), key=lambda c: positions[c])]
@staticmethod
def _looks_like_specific_car_question(text: str) -> bool:
"""
Heuristic: decide if user likely asks about a specific car model,
not a general concept question.
"""
if not text:
return False
t = text.lower()
# comparison markers
if re.search(r"\b(vs|versus|compare)\b", t) or any(k in t for k in ["ื”ืฉื•ื•ืื”", "ืœืขื•ืžืช", "ืžื” ื™ื•ืชืจ ื˜ื•ื‘", "ื‘ื™ืŸ"]):
return True
# common โ€œtell me about modelโ€ phrasing
if any(k in t for k in ["tell me about", "what do you think", "review", "ืžื‘ื—ืŸ", "ื“ืขื” ืขืœ", "ืžื” ื“ืขืชืš", "ืกืคืจ ืœื™ ืขืœ", "ืชืกืคืจ ืœื™ ืขืœ"]):
return True
# model-like token patterns (letters+digits, e.g. rs3, ev9, x5)
if re.search(r"\b[a-z]{1,}\s*\d{1,}\b", t) or re.search(r"\b\d{1,}\s*[a-z]{1,}\b", t) or re.search(r"\b[a-z]{2,}\d{1,}\b", t):
return True
return False
def _unsupported_car_refusal(self, query: str, is_comparison: bool) -> str:
    """Build the refusal message shown when a requested model is not supported.

    The message language follows the query: the non-English variant is used
    when ``_is_hebrew(query)`` is true, the English one otherwise. Each
    language has a comparison wording and a single-model wording, and every
    variant lists the supported models.

    Args:
        query: The user's query (only used to pick the message language).
        is_comparison: True to use the comparison wording.

    Returns:
        The refusal text (Markdown).
    """
    # NOTE(review): the non-ASCII parts of these literals look like mis-decoded
    # UTF-8 in this copy of the file - confirm the file encoding.
    supported = ", ".join(self._supported_cars_display())
    if self._is_hebrew(query):
        if is_comparison:
            return (
                "โŒ ืื ื™ ื™ื›ื•ืœ ืœื”ืฉื•ื•ืช/ืœื”ืžืœื™ืฅ **ืจืง ืขืœ ื‘ืกื™ืก ืžื™ื“ืข ืฉืงื™ื™ื ืืฆืœื™** ืžืชื•ืš ื›ืชื‘ื•ืช ืžึพ`auto.co.il`.\n"
                "ื ืจืื” ืฉืœืคื—ื•ืช ืื—ื“ ืžื”ื“ื’ืžื™ื ืฉื‘ื™ืงืฉืช **ืœื ื ืžืฆื ื‘ื‘ืกื™ืก ื”ื™ื“ืข ืฉืœื™**, ื•ืœื›ืŸ ืืกื•ืจ ืœื™ ืœื”ืžืœื™ืฅ ืขืœื™ื• ืื• ืœื”ืฉื•ื•ืช ืื•ืชื•.\n\n"
                f"โœ… ื“ื’ืžื™ื ื ืชืžื›ื™ื ื›ืจื’ืข: {supported}\n"
                "ืื ืชืจืฆื”, ื›ืชื•ื‘ ื”ืฉื•ื•ืื” ื‘ื™ืŸ ืฉื ื™ ื“ื’ืžื™ื ืžื”ืจืฉื™ืžื”."
            )
        return (
            "โŒ ืื ื™ ื™ื›ื•ืœ ืœื”ืžืœื™ืฅ **ืจืง ืขืœ ื‘ืกื™ืก ืžื™ื“ืข ืฉืงื™ื™ื ืืฆืœื™** ืžืชื•ืš ื›ืชื‘ื•ืช ืžึพ`auto.co.il`.\n"
            "ื”ื“ื’ื ืฉื‘ื™ืงืฉืช **ืœื ื ืžืฆื ื‘ื‘ืกื™ืก ื”ื™ื“ืข ืฉืœื™**, ื•ืœื›ืŸ ืืกื•ืจ ืœื™ ืœื”ืžืœื™ืฅ ืขืœื™ื•.\n\n"
            f"โœ… ื“ื’ืžื™ื ื ืชืžื›ื™ื ื›ืจื’ืข: {supported}\n"
            "ืื ืชื›ืชื•ื‘ ืื—ื“ ืžื”ื“ื’ืžื™ื ืžื”ืจืฉื™ืžื” โ€” ืืฉืžื— ืœืขื–ื•ืจ."
        )
    else:
        if is_comparison:
            return (
                "โŒ I can compare/recommend **only using information I have** from articles scraped from `auto.co.il`.\n"
                "At least one of the models you mentioned is **not in my knowledge base**, so Iโ€™m not allowed to recommend or compare it.\n\n"
                f"โœ… Currently supported models: {supported}\n"
                "If you want, ask for a comparison between two models from this list."
            )
        return (
            "โŒ I can recommend **only using information I have** from articles scraped from `auto.co.il`.\n"
            "The model you asked about is **not in my knowledge base**, so Iโ€™m not allowed to recommend it.\n\n"
            f"โœ… Currently supported models: {supported}\n"
            "Ask about one of these models and Iโ€™ll help."
        )
def _wait_for_rate_limit(self):
"""Enforce minimum delay between API requests to avoid rate limiting"""
# Thread-safe: Gradio can execute requests concurrently.
with self._rate_limit_lock:
now = time.time()
# Honor global cooldown after a 429.
if now < self._rate_limited_until:
time.sleep(self._rate_limited_until - now)
now = time.time()
elapsed = now - self.last_request_time
if elapsed < self.request_delay:
time.sleep(self.request_delay - elapsed)
self.last_request_time = time.time()
def _get_openrouter_key(self) -> Optional[str]:
"""OpenRouter API key from env. HF Spaces: add Secret OPENROUTER_API_KEY. Local: .env openRouter_API_KEY."""
for name in ("OPENROUTER_API_KEY", "openRouter_API_KEY", "OPENROUTER_APIKEY", "OPENROUTER_KEY"):
v = os.environ.get(name)
if v and str(v).strip():
return str(v).strip()
return None
def _call_openrouter(self, system_prompt: str, prompt: str, timeout_seconds: int = 28) -> Optional[str]:
    """Send one chat-completion request to OpenRouter.

    Args:
        system_prompt: Content of the system message.
        prompt: Content of the user message.
        timeout_seconds: HTTP timeout passed to ``requests.post``.

    Returns:
        The stripped reply text, or None when no key is configured, the
        request fails, or the reply contains no text. Failures are logged,
        never raised.
    """
    api_key = self._get_openrouter_key()
    if not api_key or not api_key.strip():
        PIPELINE_LOG.info("OpenRouter key not set - add Secret OPENROUTER_API_KEY in HF Space settings. Using Gemini.")
        return None

    endpoint = "https://openrouter.ai/api/v1/chat/completions"
    # Model is overridable via OPENROUTER_MODEL; the default is a fast Gemini model on OpenRouter.
    model = os.environ.get("OPENROUTER_MODEL", "google/gemini-3-flash-preview")
    request_body = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "max_tokens": self.max_output_tokens,
        "temperature": 0.4,
    }
    request_headers = {
        "Authorization": f"Bearer {api_key.strip()}",
        "Content-Type": "application/json",
    }

    try:
        self._wait_for_rate_limit()
        PIPELINE_LOG.info("Calling OpenRouter model=%s timeout=%ds", model, timeout_seconds)
        reply = requests.post(endpoint, json=request_body, headers=request_headers, timeout=timeout_seconds)
        reply.raise_for_status()
        choices = reply.json().get("choices") or []
        if not choices:
            return None
        content = (choices[0].get("message") or {}).get("content") or ""
        answer = str(content).strip() if content else ""
        if not answer:
            return None
        PIPELINE_LOG.info("OpenRouter response OK len=%d", len(answer))
        return answer
    except Exception as e:
        # Best effort: the caller treats None as "try again / fall back".
        PIPELINE_LOG.warning("OpenRouter failed: %s", str(e)[:120])
        return None
def _call_api_with_backoff(self, system_prompt: str, prompt: str, models: List[str]):
    """Generate a reply, using OpenRouter when a key is set and Gemini otherwise.

    With an OpenRouter key: up to two OpenRouter attempts, no Gemini fallback.
    Without one: each model in ``models`` is tried in order, with up to 8
    attempts per model. Errors are classified by substring of the exception
    text into rate-limit, model-not-available and other errors (see the
    branches below).

    Args:
        system_prompt: System instruction for the model.
        prompt: User prompt.
        models: Gemini model names to try, in order of preference.

    Returns:
        The reply text on success; otherwise a user-facing error string.
        This method does not raise for API failures.
    """
    PIPELINE_LOG.info("_call_api_with_backoff START models=%s prompt_len=%d", models, len(prompt))
    openrouter_key = self._get_openrouter_key()
    if openrouter_key:
        # Generation: use only OpenRouter when key is set (avoid Gemini rate limit)
        PIPELINE_LOG.info("OpenRouter key present - using OpenRouter only for generation (no Gemini)")
        for attempt in range(2):
            result = self._call_openrouter(system_prompt, prompt, timeout_seconds=35)
            if result:
                return result
            PIPELINE_LOG.warning("OpenRouter attempt %d failed, retrying...", attempt + 1)
        return (
            "โŒ OpenRouter request failed after retries. Check OPENROUTER_API_KEY and OPENROUTER_MODEL in Space secrets. "
            "See logs for details."
        )
    # No OpenRouter key: use Gemini
    PIPELINE_LOG.info("OpenRouter key not set - using Gemini for generation")
    max_attempts_per_model = 8
    max_rate_limit_wait_s = 180  # wait up to 3 minutes per attempt before retry
    # Try each model
    for model_idx, model in enumerate(models):
        for attempt in range(max_attempts_per_model):
            try:
                # Wait before API call to respect rate limits
                self._wait_for_rate_limit()
                PIPELINE_LOG.info("Calling LLM model=%s attempt=%d", model, attempt + 1)
                try:
                    model_obj = genai.GenerativeModel(model, system_instruction=system_prompt)
                    contents = prompt
                except TypeError:
                    # Constructor rejected `system_instruction`: prepend the
                    # system prompt to the user prompt instead.
                    model_obj = genai.GenerativeModel(model)
                    contents = system_prompt + "\n\n" + prompt
                response = model_obj.generate_content(
                    contents,
                    generation_config=self._generation_config,
                )
                text = getattr(response, "text", None) if response else None
                if not (text and str(text).strip()):
                    # An empty reply is returned as a message, not retried.
                    PIPELINE_LOG.warning("LLM returned empty or no text")
                    return "โŒ The model returned no text (possibly blocked). Please try rephrasing."
                out = str(text).strip()
                PIPELINE_LOG.info("LLM response OK len=%d preview=%s", len(out), (out[:200] + "..." if len(out) > 200 else out))
                return out
            except Exception as e:
                error_text = str(e)
                error_msg = error_text.lower()
                # Handle rate limit errors - wait longer and retry more
                if "429" in error_msg or "rate" in error_msg or "quota" in error_msg or "too many" in error_msg:
                    retry_after = self._extract_retry_after_seconds(error_text) or 0
                    # Exponential backoff 10s, 20s, 40s, ... capped at max_rate_limit_wait_s.
                    backoff = min(max_rate_limit_wait_s, 10 * (2 ** min(attempt, 5)))
                    jitter = random.uniform(0.0, 1.0)
                    wait_time = min(max_rate_limit_wait_s, max(retry_after, backoff) + jitter)
                    # Global cooldown so concurrent calls don't stampede.
                    with self._rate_limit_lock:
                        self._rate_limited_until = max(self._rate_limited_until, time.time() + wait_time)
                    print(f"โš ๏ธ Rate limited on {model}. Waiting {wait_time:.1f}s before retry ({attempt + 1}/{max_attempts_per_model})...")
                    time.sleep(wait_time)
                    # Retry same model unless attempts exhausted.
                    if attempt < max_attempts_per_model - 1:
                        continue
                    if model_idx < len(models) - 1:
                        print("โš ๏ธ Rate limit persists. Trying next model...")
                        break
                    msg = "โš ๏ธ API Rate Limit: ื”ืžืชื™ืŸ ื›ึพ2โ€“3 ื“ืงื•ืช ื•ื ืกื” ืฉื•ื‘. / Please wait 2โ€“3 minutes and try again."
                    PIPELINE_LOG.warning("_call_api_with_backoff returning rate-limit message")
                    return msg
                # Handle 404 errors - model not available, try next one
                elif "404" in error_msg or "not found" in error_msg or "not supported" in error_msg:
                    if model_idx < len(models) - 1:
                        print(f"โš ๏ธ Model {model} not available. Trying next model...")
                        time.sleep(2)
                        break  # Move to next model
                    else:
                        PIPELINE_LOG.warning("_call_api_with_backoff no available models")
                        return f"โŒ No available models. Please try again later."
                # Other errors - up to two retries on the same model (attempts 0 and 1)
                else:
                    if attempt < 2:
                        sleep_s = 1.5 * (attempt + 1)
                        print(f"โš ๏ธ Error: {str(e)[:80]}. Retrying in {sleep_s:.1f}s...")
                        time.sleep(sleep_s)
                        continue
                    # If the retries also failed, try next model
                    if model_idx < len(models) - 1:
                        print("โš ๏ธ Trying next model...")
                        break
                    PIPELINE_LOG.warning("_call_api_with_backoff error: %s", str(e)[:150])
                    return f"โŒ Error: {str(e)[:100]}"
    # Reached when `models` is empty or every model loop ended without a return.
    PIPELINE_LOG.warning("_call_api_with_backoff exhausted all models, returning failure")
    return "โŒ Failed to get response from API"
def _call_api_with_backoff_stream(self, system_prompt: str, prompt: str, models: List[str]):
    """
    Streaming version: a generator that yields text while the model generates.

    Every yielded value is the full text accumulated so far (not just the new
    piece), so the last value yielded is the complete reply. The generator
    returns no value. On failure a single user-facing error string is yielded
    instead. If an error occurs after some text was already yielded, the retry
    starts accumulating again from an empty string.

    Models are tried in order with up to 8 attempts each; errors are
    classified by substring of the exception text, as in the branches below.
    """
    max_attempts_per_model = 8
    max_rate_limit_wait_s = 180
    for model_idx, model in enumerate(models):
        for attempt in range(max_attempts_per_model):
            try:
                self._wait_for_rate_limit()
                try:
                    model_obj = genai.GenerativeModel(model, system_instruction=system_prompt)
                    contents = prompt
                except TypeError:
                    # Constructor rejected `system_instruction`: prepend the
                    # system prompt to the user prompt instead.
                    model_obj = genai.GenerativeModel(model)
                    contents = system_prompt + "\n\n" + prompt
                stream = model_obj.generate_content(
                    contents,
                    generation_config=self._generation_config,
                    stream=True,
                )
                acc = ""
                start = time.time()
                timeout = self._api_timeout_seconds
                for chunk in stream:
                    # The timeout is only checked when a chunk arrives.
                    if time.time() - start > timeout:
                        yield (acc + "\n\nโฑ๏ธ Request timed out. Partial response above. Try again or shorten the query.") if acc else "โฑ๏ธ Request timed out. Please try again."
                        return
                    piece = getattr(chunk, "text", "") or ""
                    if not piece:
                        continue
                    acc += piece
                    yield acc
                return
            except Exception as e:
                error_text = str(e)
                error_msg = error_text.lower()
                # Rate limit: back off, set the shared cooldown, then retry / next model.
                if "429" in error_msg or "rate" in error_msg or "quota" in error_msg or "too many" in error_msg:
                    retry_after = self._extract_retry_after_seconds(error_text) or 0
                    backoff = min(max_rate_limit_wait_s, 10 * (2 ** min(attempt, 5)))
                    jitter = random.uniform(0.0, 1.0)
                    wait_time = min(max_rate_limit_wait_s, max(retry_after, backoff) + jitter)
                    with self._rate_limit_lock:
                        self._rate_limited_until = max(self._rate_limited_until, time.time() + wait_time)
                    time.sleep(wait_time)
                    if attempt < max_attempts_per_model - 1:
                        continue
                    if model_idx < len(models) - 1:
                        break
                    yield "โš ๏ธ API Rate Limit: ื”ืžืชื™ืŸ ื›ึพ2โ€“3 ื“ืงื•ืช ื•ื ืกื” ืฉื•ื‘. / Please wait 2โ€“3 minutes and try again."
                    return
                # Model not available: move on to the next model, if any.
                if "404" in error_msg or "not found" in error_msg or "not supported" in error_msg:
                    if model_idx < len(models) - 1:
                        time.sleep(1.0)
                        break
                    yield "โŒ No available models. Please try again later."
                    return
                # Any other error: up to two retries on the same model, then next model.
                if attempt < 2:
                    time.sleep(1.0 + attempt)
                    continue
                if model_idx < len(models) - 1:
                    break
                yield f"โŒ Error: {error_text[:120]}"
                return
    # Reached when `models` is empty or every model loop ended without returning.
    yield "โŒ Failed to get response from API"
def configure_api(self, api_key: str) -> None:
    """Configure the Gemini API key (for use by an external agent).

    Passes ``api_key`` to ``genai.configure``; stores nothing on the instance.
    """
    genai.configure(api_key=api_key)
def prepare_generation(self, query: str) -> Tuple[Optional[str], Optional[str], Optional[str], List[str]]:
    """
    Run the RAG pipeline up to (but not including) the LLM call.

    The method normalizes car names in the query, decides whether the query is a
    follow-up on models already discussed in the session, classifies it as a
    comparison or single-model question, retrieves context chunks, and assembles
    the system and user prompts.

    Args:
        query: The raw user question.

    Returns:
        A tuple ``(refusal_message, system_prompt, user_prompt, steps_log)``.
        When ``refusal_message`` is set, both prompts are ``None``. Otherwise
        ``refusal_message`` is ``None`` and the prompts are ready for generation.
        ``steps_log`` always contains the progress lines recorded so far.
    """
    PIPELINE_LOG.info("prepare_generation START query=%r", query[:80] if query else "")
    steps_log: List[str] = []

    steps_log.append("๐Ÿ” Normalizing car names...")
    canonical = self._normalize_car_name(query)
    # Resolved once and reused for every later decision; the query does not change.
    ordered_supported = self._get_ordered_supported_canonicals_in_text(query)
    current_query_cars = set(ordered_supported) if ordered_supported else set()
    if canonical:
        current_query_cars.add(canonical)
        search_query = canonical
    else:
        search_query = query

    # Models the user already talked about in this conversation; on a follow-up
    # we stick only to them.
    mentioned_in_session = self._get_mentioned_cars_in_conversation(max_turns=5)
    # Follow-up = there is session history and the query names no model outside it.
    is_follow_up = (
        len(mentioned_in_session) > 0
        and (not current_query_cars or current_query_cars <= mentioned_in_session)
    )
    if is_follow_up:
        steps_log.append(f"๐Ÿ“Œ Follow-up: ื“ื‘ืงื™ื ื‘ื“ื’ืžื™ ื”ืฉื™ื—ื” โ€“ {', '.join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in sorted(mentioned_in_session))}")

    is_comparison = self._is_comparison_question(query)
    if is_comparison:
        steps_log.append("๐Ÿ“‹ Detected: comparison question (rule-based)")
    else:
        steps_log.append("๐Ÿ“‹ Detected: single-model question (rule-based)")

    # Show the user which cars were identified (comparison: both; single: one).
    if is_comparison:
        if len(ordered_supported) >= 2:
            names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in ordered_supported[:2])
            steps_log.append(f"โœ… ื–ื™ื”ื•ื™ ื“ื’ืžื™ื ืœื”ืฉื•ื•ืื”: {names}")
        elif len(ordered_supported) == 1:
            one_display = self.CANONICAL_TO_DISPLAY.get(ordered_supported[0], ordered_supported[0])
            steps_log.append(f"โœ… ื–ื™ื”ื•ื™ ื“ื’ื ืื—ื“ (ื”ืฉื ื™ ืœื ื‘ืจืฉื™ืžื”): {one_display}")
    else:
        if canonical:
            steps_log.append(f"โœ… Recognized canonical id: {canonical}")
        elif not ordered_supported:
            steps_log.append("โ„น๏ธ No canonical car found; using full query for search")

    # Refuse early when no supported model can be identified.
    if is_comparison:
        if len(ordered_supported) == 0:
            refusal = self._unsupported_car_refusal(query, is_comparison=True)
            PIPELINE_LOG.info("prepare_generation END refusal=True (comparison, no supported) steps=%d", len(steps_log))
            return (refusal, None, None, steps_log)
    else:
        if not canonical and not ordered_supported and self._looks_like_specific_car_question(query):
            refusal = self._unsupported_car_refusal(query, is_comparison=False)
            PIPELINE_LOG.info("prepare_generation END refusal=True (single, unsupported) steps=%d", len(steps_log))
            return (refusal, None, None, steps_log)

    steps_log.append("๐Ÿ”Ž Searching knowledge base (vectors + keywords)...")
    comparison_prompt = ""
    context_results = []
    if is_comparison:
        if len(ordered_supported) >= 2:
            car1_can, car2_can = ordered_supported[0], ordered_supported[1]
            car1_display = self.CANONICAL_TO_DISPLAY.get(car1_can, car1_can)
            car2_display = self.CANONICAL_TO_DISPLAY.get(car2_can, car2_can)
            steps_log.append("๐Ÿ“Š Extracting structured comparison data (regex)...")
            comparison_data = self._extract_comparison_data(car1_can, car2_can)
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_comparison)
            steps_log.append(f"โœ… Retrieved {len(context_results)} chunks for comparison")
            comparison_prompt = f"""
Based on the car reviews, create a structured comparison between {car1_display} and {car2_display}:
Format your response as:
**ื™ืชืจื•ื ื•ืช {car1_display}:**
- [list advantages]
**ื™ืชืจื•ื ื•ืช {car2_display}:**
- [list advantages]
**ื”ืžืœืฆื” ืœืคื™ ืคืจื•ืคื™ืœ ืžืฉืชืžืฉ:**
- [personalized recommendation]
Structured Data:
{json.dumps(comparison_data, ensure_ascii=False, indent=2)}
Context from reviews:
"""
        elif len(ordered_supported) == 1:
            # One model is supported and one or more are not: a comparison is
            # impossible, but we can still describe the model we know.
            one_can = ordered_supported[0]
            one_display = self.CANONICAL_TO_DISPLAY.get(one_can, one_can)
            steps_log.append(f"๐Ÿ“‹ One supported model ({one_display}); providing info only for it")
            context_results = self._hybrid_search(one_can, top_k=self.max_chunks_general)
            steps_log.append(f"โœ… Retrieved {len(context_results)} chunks")
            if self._is_hebrew(query):
                comparison_prompt = f"""
ื”ืžืฉืชืžืฉ ื‘ื™ืงืฉ ื”ืฉื•ื•ืื”. ืื—ื“ ื”ื“ื’ืžื™ื (ืื• ื™ื•ืชืจ) ืฉื”ื•ื ืฆื™ื™ืŸ **ืœื ื ืžืฆื ื‘ื‘ืกื™ืก ื”ื™ื“ืข ืฉืœื™** โ€“ ืื™ ืืคืฉืจ ืœื”ืฉื•ื•ืช ืœื“ื’ืžื™ื ืฉืœื ืœืžื“ืชื™ ืขืœื™ื”ื.
ื‘ืชืฉื•ื‘ืชืš: ืฆื™ื™ืŸ ื‘ืงืฆืจื” ืฉืื™ื ืš ื™ื›ื•ืœ ืœื”ืฉื•ื•ืช ืœื“ื’ืžื™ื ืฉืœื ืœืžื“ืช ืขืœื™ื”ื, ื•ืื– ืกืคืง ืžื™ื“ืข ืžืœื ืจืง ืขืœ ื”ื“ื’ื ืฉื›ืŸ ื ืžืฆื ื‘ืจืฉื™ืžื”: **{one_display}**, ื‘ื”ืชื‘ืกืก ืขืœ ื”ื”ืงืฉืจ ืœืžื˜ื”.
Context from reviews:
"""
            else:
                comparison_prompt = f"""
The user asked for a comparison. One or more models they mentioned are **not in my knowledge base** โ€“ I cannot compare to models I haven't learned about.
In your response: briefly state that you cannot compare to models you haven't learned about, then provide full information only about the model that is in my list: **{one_display}**, based on the context below.
Context from reviews:
"""
        else:
            # Defensive branch: a comparison with zero supported models has
            # already been refused above, so this is not reached in practice.
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
            comparison_prompt = "Answer in the same language as the user's question. "
            steps_log.append(f"โœ… Retrieved {len(context_results)} chunks")
    else:
        # On a follow-up, search only in the context of the models the user
        # already discussed (at most four, in sorted order).
        retrieval_query = search_query
        if is_follow_up and mentioned_in_session:
            retrieval_query = " ".join(
                self.CANONICAL_TO_DISPLAY.get(c, c) for c in sorted(mentioned_in_session)[:4]
            )
        context_results = self._hybrid_search(retrieval_query, top_k=self.max_chunks_general)
        steps_log.append(f"โœ… Retrieved {len(context_results)} relevant chunks")

    # Render every retrieved chunk with its metadata; join once instead of
    # growing a string inside the loop.
    context_parts = []
    for r in context_results:
        meta = r['metadata']
        context_parts.append(f"""
Source: {meta['title']}
Topic: {meta['topic']}
Content: {r['text'][:self.max_context_chars_per_chunk]}...
""")
    context_text = "".join(context_parts)

    conversation_context = self._get_context_from_history()

    # Extra system-prompt rule that pins the answer to the session's models.
    session_models_instruction = ""
    if is_follow_up and mentioned_in_session:
        session_names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in sorted(mentioned_in_session))
        session_models_instruction = f"""
5. **Session context:** In this chat the user has been discussing only these models: {session_names}. For this follow-up question, answer ONLY in the context of these models. Do not introduce or recommend other models unless (a) the user explicitly asks to compare with another model, or (b) you are giving a brief tip like "ืื ืชืจืฆื” ื”ืฉื•ื•ืื” ืœื“ื’ื ืื—ืจ, ืืคืฉืจ ืœืฉืื•ืœ ืขืœ..." / "If you want to compare with another model, you can ask about...". Stick to the session models for the main answer.
"""

    system_prompt = """You are an expert automotive assistant. Your reply in the chat MUST be a single, concrete verbal answer that the user will see directly.
Your task:
1. Use ONLY the "Context from car reviews" provided in the user message.
2. Combine everything you understood from the user's question with everything relevant you retrieved from the sources into ONE coherent, verbal answerโ€”as a car expert would say to a friend in the chat. Do not output raw snippets, bullet-only lists, or separate fragments; write a unified answer (paragraphs and/or structured sections) that directly answers the question.
3. Respond in the same language as the user (Hebrew or English). For comparison questions, provide a structured analysis with clear advantages for each vehicle, still in one cohesive reply.
4. If the context is empty or irrelevant, say clearly that you have no information from your knowledge base for this question.
""" + session_models_instruction + """
The user expects to see this single aggregated answer in the chatโ€”make it complete and concrete."""

    user_prompt = f"""Context from car reviews:
{context_text if context_text.strip() else "(No matching chunks found.)"}
Previous conversation context (last turns):
{conversation_context}
User question: {query}
{comparison_prompt}
Synthesize the context above into one clear, verbal answer that aggregates all relevant information and directly answers the user's question. Your entire response will be shown to the user as the chat reply:"""

    PIPELINE_LOG.info("prepare_generation END refusal=%s has_system_prompt=%s has_user_prompt=%s steps=%d",
                      False, bool(system_prompt), bool(user_prompt), len(steps_log))
    return (None, system_prompt, user_prompt, steps_log)
def generate_response(self, query: str, history, api_key: str):
    """
    Produce a complete (non-streamed) answer for ``query``.

    Returns a cached answer when the same query was answered before. Otherwise
    normalizes car names, classifies the question, retrieves context, builds the
    prompts and calls Gemini through the rate-limited back-off helper.

    Args:
        query: The raw user question.
        history: Accepted for interface compatibility; not read by this method
            (conversation context comes from ``self._get_context_from_history``).
        api_key: Gemini API key; an empty value short-circuits with an error string.

    Returns:
        The text to display: an error or refusal message, a cached answer with a
        cache note, or the processing-step header followed by the model's answer.
    """
    if not api_key:
        return "Error: Gemini API Key is missing."

    # Processing log shown to the user for transparency.
    processing_steps = []

    # Identical queries are served from the cache, flagged as such.
    cache_key = self._get_cache_key(query)
    if cache_key in self.response_cache:
        cached = self.response_cache[cache_key]
        return f"๐Ÿ” Returned cached result\n\n{cached}"

    genai.configure(api_key=api_key)

    # Step 1 - normalization.
    processing_steps.append("๐Ÿ” Normalizing car names...")
    canonical = self._normalize_car_name(query)
    if canonical:
        search_query = canonical
    else:
        search_query = query

    # Tip 7: detect comparison questions.
    is_comparison = self._is_comparison_question(query)
    # Policy guard: do not recommend models without auto.co.il articles in our KB.
    ordered_supported = self._get_ordered_supported_canonicals_in_text(query)

    # Show the user which cars were identified (comparison: both; single: one).
    if is_comparison:
        if len(ordered_supported) >= 2:
            names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in ordered_supported[:2])
            processing_steps.append(f"โœ… ื–ื™ื”ื•ื™ ื“ื’ืžื™ื ืœื”ืฉื•ื•ืื”: {names}")
        elif len(ordered_supported) == 1:
            one_display = self.CANONICAL_TO_DISPLAY.get(ordered_supported[0], ordered_supported[0])
            processing_steps.append(f"โœ… ื–ื™ื”ื•ื™ ื“ื’ื ืื—ื“ (ื”ืฉื ื™ ืœื ื‘ืจืฉื™ืžื”): {one_display}")
    else:
        if canonical:
            processing_steps.append(f"โœ… Recognized canonical id: {canonical}")
        else:
            processing_steps.append("โ„น๏ธ No canonical car found; using full query for search")

    # Refuse early when no supported model can be identified.
    if is_comparison:
        if len(ordered_supported) == 0:
            return self._unsupported_car_refusal(query, is_comparison=True)
    else:
        if not canonical and not ordered_supported and self._looks_like_specific_car_question(query):
            return self._unsupported_car_refusal(query, is_comparison=False)

    if is_comparison:
        if len(ordered_supported) >= 2:
            car1_can, car2_can = ordered_supported[0], ordered_supported[1]
            car1_display = self.CANONICAL_TO_DISPLAY.get(car1_can, car1_can)
            car2_display = self.CANONICAL_TO_DISPLAY.get(car2_can, car2_can)
            processing_steps.append("๐Ÿ“Š Extracting structured comparison data...")
            comparison_data = self._extract_comparison_data(car1_can, car2_can)
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_comparison)
            comparison_prompt = f"""
Based on the car reviews, create a structured comparison between {car1_display} and {car2_display}:
Format your response as:
**ื™ืชืจื•ื ื•ืช {car1_display}:**
- [list advantages]
**ื™ืชืจื•ื ื•ืช {car2_display}:**
- [list advantages]
**ื”ืžืœืฆื” ืœืคื™ ืคืจื•ืคื™ืœ ืžืฉืชืžืฉ:**
- [personalized recommendation]
Structured Data:
{json.dumps(comparison_data, ensure_ascii=False, indent=2)}
Context from reviews:
"""
        elif len(ordered_supported) == 1:
            # Only one of the requested models is supported: describe that one.
            one_can = ordered_supported[0]
            one_display = self.CANONICAL_TO_DISPLAY.get(one_can, one_can)
            processing_steps.append(f"๐Ÿ“‹ One supported model ({one_display}); providing info only for it")
            context_results = self._hybrid_search(one_can, top_k=self.max_chunks_general)
            if self._is_hebrew(query):
                comparison_prompt = f"""
ื”ืžืฉืชืžืฉ ื‘ื™ืงืฉ ื”ืฉื•ื•ืื”. ืื—ื“ ื”ื“ื’ืžื™ื (ืื• ื™ื•ืชืจ) ืฉื”ื•ื ืฆื™ื™ืŸ **ืœื ื ืžืฆื ื‘ื‘ืกื™ืก ื”ื™ื“ืข ืฉืœื™** โ€“ ืื™ ืืคืฉืจ ืœื”ืฉื•ื•ืช ืœื“ื’ืžื™ื ืฉืœื ืœืžื“ืชื™ ืขืœื™ื”ื.
ื‘ืชืฉื•ื‘ืชืš: ืฆื™ื™ืŸ ื‘ืงืฆืจื” ืฉืื™ื ืš ื™ื›ื•ืœ ืœื”ืฉื•ื•ืช ืœื“ื’ืžื™ื ืฉืœื ืœืžื“ืช ืขืœื™ื”ื, ื•ืื– ืกืคืง ืžื™ื“ืข ืžืœื ืจืง ืขืœ ื”ื“ื’ื ืฉื›ืŸ ื ืžืฆื ื‘ืจืฉื™ืžื”: **{one_display}**, ื‘ื”ืชื‘ืกืก ืขืœ ื”ื”ืงืฉืจ ืœืžื˜ื”.
Context from reviews:
"""
            else:
                comparison_prompt = f"""
The user asked for a comparison. One or more models they mentioned are **not in my knowledge base** โ€“ I cannot compare to models I haven't learned about.
In your response: briefly state that you cannot compare to models you haven't learned about, then provide full information only about the model that is in my list: **{one_display}**, based on the context below.
Context from reviews:
"""
        else:
            # Defensive branch: a comparison with zero supported models has
            # already been refused above, so this is not reached in practice.
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
            comparison_prompt = "Answer in the same language as the user's question. "
    else:
        processing_steps.append("๐Ÿ”Ž Searching knowledge base (hybrid vectors + keywords)...")
        context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
        comparison_prompt = ""

    # Build the context with metadata; join once instead of growing a string.
    context_parts = []
    for r in context_results:
        meta = r['metadata']
        context_parts.append(f"""
Source: {meta['title']}
Topic: {meta['topic']}
Content: {r['text'][:self.max_context_chars_per_chunk]}...
""")
    context_text = "".join(context_parts)

    # Tip 10: carry over context from the conversation history.
    conversation_context = self._get_context_from_history()

    # Tip 9: tailored prompt.
    system_prompt = """You are an expert automotive assistant. Your answer MUST be based only on the "Context from car reviews" provided in the user message.
Your task: aggregate and summarize the information from that context and give a detailed, verbal answer as a car expert would to a friend. Always output a full paragraph (or more) that directly answers the user's questionโ€”never leave the answer empty or vague.
Respond in the same language as the user (Hebrew or English). For comparison questions, provide a structured analysis with clear advantages for each vehicle.
If the context is empty or irrelevant, say you have no information from your knowledge base for this question."""
    prompt = f"""Context from car reviews:
{context_text if context_text.strip() else "(No matching chunks found.)"}
Previous conversation context (last turns):
{conversation_context}
User question: {query}
{comparison_prompt}
Based on the context above, provide a clear answer that aggregates the information and answers the user's question:"""

    processing_steps.append("๐Ÿ’ญ Generating response with Gemini...")
    # Prefer fast Flash models for latency; fall back only within the Flash tier.
    models_to_try = ['gemini-2.0-flash', 'gemini-1.5-flash']
    response_text = self._call_api_with_backoff(system_prompt, prompt, models_to_try)

    # Cache only real answers. Rate-limit, error and timeout notices are
    # transient (same prefixes the streaming variant excludes), and an empty
    # reply would otherwise be replayed forever as a blank "cached result".
    non_cacheable_prefixes = ("โš ๏ธ", "โŒ", "โฑ๏ธ")
    if response_text.strip() and not response_text.startswith(non_cacheable_prefixes):
        self.response_cache[cache_key] = response_text

    # Tip 10: store the answer in the conversation history.
    self._maintain_conversation_history(query, response_text)

    # Prepend the processing steps for UX transparency.
    processing_header = "\n".join(processing_steps)
    full_response = f"{processing_header}\n\n{response_text}"
    return full_response
def generate_response_stream(self, query: str, history, api_key: str):
    """
    Stream progress and the answer for ``query``.

    Each pipeline step is yielded as it completes (no generic placeholders).
    Only one LLM call is made, at the end; normalization, comparison detection
    and search are rule-based and make no LLM call.

    Args:
        query: The raw user question.
        history: Accepted for interface compatibility; not read by this method
            (conversation context comes from ``self._get_context_from_history``).
        api_key: Gemini API key; an empty value yields an error string and stops.

    Yields:
        The full text to display at each moment: the accumulated step log and,
        once generation starts, the step log followed by the answer so far.
    """
    if not api_key:
        yield "Error: Gemini API Key is missing."
        return

    processing_steps: List[str] = []

    def steps_text() -> str:
        # Current step log rendered as one line per step.
        return "\n".join(processing_steps)

    cache_key = self._get_cache_key(query)
    if cache_key in self.response_cache:
        yield f"๐Ÿ” Returned cached result\n\n{self.response_cache[cache_key]}"
        return

    genai.configure(api_key=api_key)

    # --- Step 1: Normalization (rule-based, no LLM) ---
    processing_steps.append("๐Ÿ” Normalizing car names...")
    yield steps_text()
    canonical = self._normalize_car_name(query)
    if canonical:
        search_query = canonical
    else:
        search_query = query

    # --- Step 2: Question type (rule-based regex/keywords, no LLM) ---
    is_comparison = self._is_comparison_question(query)
    if is_comparison:
        processing_steps.append("๐Ÿ“‹ Detected: comparison question (rule-based)")
    else:
        processing_steps.append("๐Ÿ“‹ Detected: single-model question (rule-based)")
    yield steps_text()

    ordered_supported = self._get_ordered_supported_canonicals_in_text(query)
    # Show the user which cars were identified (comparison: both; single: one).
    if is_comparison:
        if len(ordered_supported) >= 2:
            names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in ordered_supported[:2])
            processing_steps.append(f"โœ… ื–ื™ื”ื•ื™ ื“ื’ืžื™ื ืœื”ืฉื•ื•ืื”: {names}")
        elif len(ordered_supported) == 1:
            one_display = self.CANONICAL_TO_DISPLAY.get(ordered_supported[0], ordered_supported[0])
            processing_steps.append(f"โœ… ื–ื™ื”ื•ื™ ื“ื’ื ืื—ื“ (ื”ืฉื ื™ ืœื ื‘ืจืฉื™ืžื”): {one_display}")
    else:
        if canonical:
            processing_steps.append(f"โœ… Recognized canonical id: {canonical}")
        else:
            processing_steps.append("โ„น๏ธ No canonical car found; using full query for search")
    yield steps_text()

    # Refuse early when no supported model can be identified.
    if is_comparison:
        if len(ordered_supported) == 0:
            yield self._unsupported_car_refusal(query, is_comparison=True)
            return
    else:
        if not canonical and not ordered_supported and self._looks_like_specific_car_question(query):
            yield self._unsupported_car_refusal(query, is_comparison=False)
            return

    # --- Step 3: Search (index built offline; only query embedding at runtime) ---
    processing_steps.append("๐Ÿ”Ž Searching knowledge base (vectors + keywords)...")
    yield steps_text()
    comparison_prompt = ""
    context_results = []
    if is_comparison:
        if len(ordered_supported) >= 2:
            car1_can, car2_can = ordered_supported[0], ordered_supported[1]
            car1_display = self.CANONICAL_TO_DISPLAY.get(car1_can, car1_can)
            car2_display = self.CANONICAL_TO_DISPLAY.get(car2_can, car2_can)
            processing_steps.append("๐Ÿ“Š Extracting structured comparison data (regex)...")
            yield steps_text()
            comparison_data = self._extract_comparison_data(car1_can, car2_can)
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_comparison)
            processing_steps.append(f"โœ… Retrieved {len(context_results)} chunks for comparison")
            yield steps_text()
            comparison_prompt = f"""
Based on the car reviews, create a structured comparison between {car1_display} and {car2_display}:
Format your response as:
**ื™ืชืจื•ื ื•ืช {car1_display}:**
- [list advantages]
**ื™ืชืจื•ื ื•ืช {car2_display}:**
- [list advantages]
**ื”ืžืœืฆื” ืœืคื™ ืคืจื•ืคื™ืœ ืžืฉืชืžืฉ:**
- [personalized recommendation]
Structured Data:
{json.dumps(comparison_data, ensure_ascii=False, indent=2)}
Context from reviews:
"""
        elif len(ordered_supported) == 1:
            # Only one of the requested models is supported: describe that one.
            one_can = ordered_supported[0]
            one_display = self.CANONICAL_TO_DISPLAY.get(one_can, one_can)
            processing_steps.append(f"๐Ÿ“‹ One supported model ({one_display}); providing info only for it")
            context_results = self._hybrid_search(one_can, top_k=self.max_chunks_general)
            processing_steps.append(f"โœ… Retrieved {len(context_results)} chunks")
            yield steps_text()
            if self._is_hebrew(query):
                comparison_prompt = f"""
ื”ืžืฉืชืžืฉ ื‘ื™ืงืฉ ื”ืฉื•ื•ืื”. ืื—ื“ ื”ื“ื’ืžื™ื (ืื• ื™ื•ืชืจ) ืฉื”ื•ื ืฆื™ื™ืŸ **ืœื ื ืžืฆื ื‘ื‘ืกื™ืก ื”ื™ื“ืข ืฉืœื™** โ€“ ืื™ ืืคืฉืจ ืœื”ืฉื•ื•ืช ืœื“ื’ืžื™ื ืฉืœื ืœืžื“ืชื™ ืขืœื™ื”ื.
ื‘ืชืฉื•ื‘ืชืš: ืฆื™ื™ืŸ ื‘ืงืฆืจื” ืฉืื™ื ืš ื™ื›ื•ืœ ืœื”ืฉื•ื•ืช ืœื“ื’ืžื™ื ืฉืœื ืœืžื“ืช ืขืœื™ื”ื, ื•ืื– ืกืคืง ืžื™ื“ืข ืžืœื ืจืง ืขืœ ื”ื“ื’ื ืฉื›ืŸ ื ืžืฆื ื‘ืจืฉื™ืžื”: **{one_display}**, ื‘ื”ืชื‘ืกืก ืขืœ ื”ื”ืงืฉืจ ืœืžื˜ื”.
Context from reviews:
"""
            else:
                comparison_prompt = f"""
The user asked for a comparison. One or more models they mentioned are **not in my knowledge base** โ€“ I cannot compare to models I haven't learned about.
In your response: briefly state that you cannot compare to models you haven't learned about, then provide full information only about the model that is in my list: **{one_display}**, based on the context below.
Context from reviews:
"""
        else:
            # Defensive branch: a comparison with zero supported models has
            # already been refused above, so this is not reached in practice.
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
            comparison_prompt = "Answer in the same language as the user's question. "
            processing_steps.append(f"โœ… Retrieved {len(context_results)} chunks")
            yield steps_text()
    else:
        context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
        processing_steps.append(f"โœ… Retrieved {len(context_results)} relevant chunks")
        yield steps_text()

    # --- Build prompt (no LLM) ---
    context_parts = []
    for r in context_results:
        meta = r['metadata']
        context_parts.append(f"""
Source: {meta['title']}
Topic: {meta['topic']}
Content: {r['text'][:self.max_context_chars_per_chunk]}...
""")
    context_text = "".join(context_parts)
    conversation_context = self._get_context_from_history()
    system_prompt = """You are an expert automotive assistant. Your answer MUST be based only on the "Context from car reviews" provided in the user message.
Your task: aggregate and summarize the information from that context and give a detailed, verbal answer as a car expert would to a friend. Always output a full paragraph (or more) that directly answers the user's questionโ€”never leave the answer empty or vague.
Respond in the same language as the user (Hebrew or English). For comparison questions, provide a structured analysis with clear advantages for each vehicle.
If the context is empty or irrelevant, say you have no information from your knowledge base for this question."""
    prompt = f"""Context from car reviews:
{context_text if context_text.strip() else "(No matching chunks found.)"}
Previous conversation context (last turns):
{conversation_context}
User question: {query}
{comparison_prompt}
Based on the context above, provide a clear answer that aggregates the information and answers the user's question:"""

    # --- Step 4: Single LLM call (streamed) ---
    processing_steps.append("๐Ÿ’ญ Generating response with Gemini...")
    yield steps_text()
    models_to_try = ['gemini-2.0-flash', 'gemini-1.5-flash']
    response_text = ""
    for partial in self._call_api_with_backoff_stream(system_prompt, prompt, models_to_try):
        # Each item is the full text accumulated so far, not a delta.
        response_text = partial
        yield f"{steps_text()}\n\n{response_text}"

    # Don't cache errors, timeouts or blank replies. A timeout that hits after
    # some text was streamed is reported as "<partial text>\n\n<notice>", so it
    # does not start with the timeout prefix and must be detected by its tail.
    failure_prefixes = ("โš ๏ธ", "โŒ", "โฑ๏ธ")
    partial_timeout_tail = "Request timed out. Partial response above. Try again or shorten the query."
    is_cacheable = (
        bool(response_text.strip())
        and not response_text.startswith(failure_prefixes)
        and not response_text.rstrip().endswith(partial_timeout_tail)
    )
    if is_cacheable:
        self.response_cache[cache_key] = response_text

    self._maintain_conversation_history(query, response_text)
    processing_steps.append("โœ… Done")
    yield f"{steps_text()}\n\n{response_text}"
# Simple smoke test: runs only when this module is executed as a script.
if __name__ == "__main__":
    # RAGEngine() reads <module dir>/data_ingestion/scraped_data.json by default,
    # so check that exact file instead of "scraped_data.json" in the current
    # working directory, and hand the same path to the engine.
    default_data_path = os.path.join(
        os.path.dirname(__file__), "data_ingestion", "scraped_data.json"
    )
    if not os.path.exists(default_data_path):
        print("No data found, skipping test.")
    else:
        engine = RAGEngine(data_path=default_data_path)
        res = engine.retrieve("How is the Kia EV9?")
        # Guard against an empty result list before indexing the top hit.
        if res:
            print(f"Top result: {res[0]['text'][:100]}...")
        else:
            print("No results returned for the test query.")