import json
import numpy as np
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
import os
import re
import requests
from collections import defaultdict
from typing import List, Dict, Tuple, Optional
import time
import hashlib
import random
import threading
import logging

# Pipeline logging: visible in HF Spaces logs and helps debug RAG flow
PIPELINE_LOG = logging.getLogger("pipeline")
if not PIPELINE_LOG.handlers:
    h = logging.StreamHandler()
    h.setFormatter(logging.Formatter("[PIPELINE] %(message)s"))
    PIPELINE_LOG.addHandler(h)
    PIPELINE_LOG.setLevel(logging.INFO)

class RAGEngine:
    def __init__(self, data_path=None):
        print("Initializing RAG Engine with Advanced Features...")
        if data_path is None:
            base_dir = os.path.dirname(__file__)
            self.data_path = os.path.join(base_dir, "data_ingestion", "scraped_data.json")
        else:
            self.data_path = data_path
            
        print(f"Using data path: {self.data_path}")
        # Encoder loaded at startup (before building index). No lazy load.
        self.encoder = None
        self._encoder_model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
        
        # Initialize advanced features
        self.chunks = []
        self.chunk_metadata = []  # עצה 2: מטא-דאטה עשירה
        self.embeddings = None
        self.keyword_index = {}  # עצה 3: חיפוש מילות מפתח
        self.car_normalization = self._build_car_normalization()  # עצה 4: נרמול שמות
        # Regex patterns for robust cold name detection
        self._build_regex_patterns()
        self.conversation_history = []  # עצה 10: היסטוריית שיחה
        
        # Rate limiting and caching
        self.response_cache = {}  # Cache for identical queries
        self.last_request_time = 0  # Track last API request time
        # Minimum delay between requests (seconds). Can be tuned via env.
        # Minimal delay; rate-limit backoff handles 429s.
        self.request_delay = float(os.environ.get("GEMINI_REQUEST_DELAY", "0.5"))
        # Thread-safe throttling + shared cooldown across concurrent requests.
        self._rate_limit_lock = threading.Lock()
        self._rate_limited_until = 0.0

        # Speed/size knobs (reduce prompt size + generation time)
        self.max_chunks_general = int(os.environ.get("RAG_TOP_K_GENERAL", "4"))
        self.max_chunks_comparison = int(os.environ.get("RAG_TOP_K_COMPARISON", "6"))
        self.max_context_chars_per_chunk = int(os.environ.get("RAG_CONTEXT_CHARS_PER_CHUNK", "280"))
        self.max_output_tokens = int(os.environ.get("GEMINI_MAX_OUTPUT_TOKENS", "600"))
        self._api_timeout_seconds = int(os.environ.get("GEMINI_API_TIMEOUT", "45"))
        self._generation_config = {
            "max_output_tokens": self.max_output_tokens,
            "temperature": 0.4,
        }
        
        self._load_and_process_data()
        # Build or load chunk embeddings. Encoder is loaded only when building (no cache); otherwise lazy on first search.
        self._build_index()
        print("RAG Engine Initialized (encoder + all embeddings ready at startup).")

    def _build_car_normalization(self) -> Dict[str, str]:
        """עצה 4: מילון נרמול שמות רכבים (עברית + אנגלית)"""
        return {
            # טויוטה קורולה
            'קורולה': 'toyota_corolla',
            'toyota corolla': 'toyota_corolla',
            'טויוטה קורולה': 'toyota_corolla',
            'corolla': 'toyota_corolla',

            # סיטרואן C3
            'c3': 'citroen_c3',
            'citroen c3': 'citroen_c3',
            'סיטרואן c3': 'citroen_c3',
            'c3 החדשה': 'citroen_c3',

            # אודי RS3
            'rs3': 'audi_rs3',
            'audi rs3': 'audi_rs3',
            'אודי rs3': 'audi_rs3',

            # קיה EV9
            'ev9': 'kia_ev9',
            'kia ev9': 'kia_ev9',
            'קיה ev9': 'kia_ev9',

            # MG S6
            's6': 'mg_s6',
            'mg s6': 'mg_s6',
            'mg-s6': 'mg_s6',

            # יונדאי אלנטרה N
            'elantra n': 'hyundai_elantra_n',
            'אלנטרה n': 'hyundai_elantra_n',
            'elantra': 'hyundai_elantra_n',

            # איון HT
            'aion ht': 'aion_ht',
            'ht': 'aion_ht',
            'איון ht': 'aion_ht',

            # ג'נסיס GV80
            'genesis gv80': 'genesis_gv80',
            'gv80': 'genesis_gv80',
            "ג'נסיס gv80": 'genesis_gv80',

            # Link & Co 01 (support "and", "&", Hebrew לינק אנד/& קו)
            'link & co 01': 'link_co_01',
            'link co 01': 'link_co_01',
            'link and co 01': 'link_co_01',
            'link and co': 'link_co_01',
            "לינק אנד קו 01": 'link_co_01',
            "לינק אנד קו": 'link_co_01',
            "לינק & קו 01": 'link_co_01',
            "לינק & קו": 'link_co_01',
        }

    def _chunk_by_topic(self, text: str, title: str, url: str) -> List[Dict]:
        """עצה 1: חלוקת מידע לחתיכות לפי נושאים"""
        chunks_list = []
        
        # נושאים עיקריים לחתוך
        topics = {
            'מפרט טכני|ביצועים|מנוע|תיבה|הנעה': 'technical_specs',
            'בטיחות|מערכות בטיחות|בלמים': 'safety',
            'מחיר|מחירה': 'price',
            'עיצוב|מראה|חיצוני': 'design',
            'נוחות|התנהלות|משבושים': 'comfort',
            'צריכה|טווח|טעינה': 'efficiency',
            'דינמיקה|הגה|ביצועים דינמיים': 'dynamic',
        }
        
        # חלוקה בסיסית לפסקאות
        paragraphs = text.split('\n')
        current_chunk = []
        current_topic = 'general'
        
        for para in paragraphs:
            if len(para.strip()) < 20:
                continue
            
            # זיהוי נושא
            for pattern, topic in topics.items():
                if re.search(pattern, para, re.IGNORECASE):
                    if current_chunk and current_topic != topic:
                        chunk_text = '\n'.join(current_chunk)
                        if len(chunk_text) > 50:
                            chunks_list.append({
                                'text': chunk_text,
                                'topic': current_topic,
                                'title': title,
                                'url': url
                            })
                        current_chunk = []
                    current_topic = topic
                    break
            
            current_chunk.append(para)
        
        # הוספת הנתון האחרון
        if current_chunk:
            chunk_text = '\n'.join(current_chunk)
            if len(chunk_text) > 50:
                chunks_list.append({
                    'text': chunk_text,
                    'topic': current_topic,
                    'title': title,
                    'url': url
                })
        
        return chunks_list

    def _normalize_car_name(self, text: str) -> str:
        """עצה 4: נרמול שמות רכבים בטקסט

        Returns canonical id (e.g. 'audi_rs3') if matched, else returns None.
        Uses regex_patterns first, then falls back to simple variation map.
        """
        if not text:
            return None

        txt = text.lower()

        # Try regex patterns (robust, multilingual, handles spaces/hyphens)
        for pattern, canonical in getattr(self, 'regex_patterns', {}).items():
            try:
                if re.search(pattern, txt):
                    return canonical
            except re.error:
                # Skip invalid patterns (shouldn't happen)
                continue

        # Fallback: match known variants as whole words
        for variant, canonical in self.car_normalization.items():
            if re.search(rf"\b{re.escape(variant.lower())}\b", txt):
                return canonical

        return None

    def _build_regex_patterns(self):
        """בנה תבניות רגקס חכמות לזיהוי שמות רכבים (כולל עברית – בלי \\b על עברית)"""
        # Patterns: English use \\b; Hebrew has no word boundary (אלנטרה can appear as "לאלנטרה")
        self.regex_patterns = {
            r'\baudi[\s\-]*rs\s*3\b': 'audi_rs3',
            r'\bcitroen[\s\-]*c\s*3\b': 'citroen_c3',
            r'\bc\s*3\b': 'citroen_c3',
            r'\bkia[\s\-]*ev\s*9\b': 'kia_ev9',
            r'\bev\s*9\b': 'kia_ev9',
            r'\bhyundai[\s\-]*elantra\s*n\b': 'hyundai_elantra_n',
            r'\belantra\s*n\b': 'hyundai_elantra_n',
            r'אלנטרה\s*[nN]?': 'hyundai_elantra_n',  # Hebrew: "אלנטרה" or "אלנטרה N" (no \\b – "לאלנטרה" ok)
            r'\baion\s*ht\b': 'aion_ht',
            r'\bgenesis[\s\-]*gv\s*80\b': 'genesis_gv80',
            r'\bgv\s*80\b': 'genesis_gv80',
            r'\blink\s*(?:&|and)\s*co\.?\s*01\b': 'link_co_01',
            r'\blink\s*&?\s*co\s*01\b': 'link_co_01',
            r'לינק\s*(?:&|אנד)\s*קו\s*01?': 'link_co_01',  # Hebrew: לינק אנד/& קו 01
            r'\brs\s*3\b': 'audi_rs3',
            r'\bcorolla\b': 'toyota_corolla',
        }

    def _extract_keywords(self, text: str) -> List[str]:
        """עצה 3: חילוץ מילות מפתח"""
        # חילוץ מילות עם משמעות (בעברית ואנגלית)
        keywords = []
        
        # דוגמאות של מילות מפתח חשובות
        important_words = [
            r'\b\d+\s*כ"ס\b',  # קוח
            r'\b\d+\s*קמ"ש\b',  # קמ"ש
            r'\b\d+\.?\d*\s*שניות?\b',  # זמן האצה
            r'\b\d+\s*ליטר\b',  # נפח
            r'טורבו|היברידי|חשמלי|דו-מצמדית|ידנית',  # סוגי הנעה
            r'מנוע|בטיחות|נוחות|עיצוב|מחיר',  # קטגוריות
        ]
        
        for pattern in important_words:
            matches = re.findall(pattern, text, re.IGNORECASE)
            keywords.extend(matches)
        
        return list(set(keywords))

    def _load_and_process_data(self):
        """עצות 1+2: טעינה וחלוקה חכמה עם מטא-דאטה"""
        with open(self.data_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
        
        self.chunks = []
        self.chunk_metadata = []
        
        for article in raw_data:
            text = article['content']
            url = article['url']
            title = article['title']
            
            # נרמול שמות רכבים בטקסט (שמרו את התוצאה בנפרד)
            normalized_car = self._normalize_car_name(text)

            # חלוקה חכמה לפי נושאים - השתמשו בטקסט המלא (לא בערך המנורמל)
            topic_chunks = self._chunk_by_topic(text, title, url)
            
            for chunk_data in topic_chunks:
                chunk_text = chunk_data['text']
                
                # עצה 2: מטא-דאטה עשירה לכל חתיכה
                keywords = self._extract_keywords(chunk_text)
                
                metadata = {
                    "title": chunk_data['title'],
                    "url": chunk_data['url'],
                    "topic": chunk_data['topic'],
                    "keywords": keywords,
                    "publish_date": "2024-2025",  # מידע פרסום
                    "car_type": self._extract_car_type(chunk_data['title']),
                    "length": len(chunk_text)
                }
                
                self.chunks.append(chunk_text)
                self.chunk_metadata.append(metadata)
                
                # עצה 3: בנייה של אינדקס מילות מפתח
                for keyword in keywords:
                    if keyword not in self.keyword_index:
                        self.keyword_index[keyword] = []
                    self.keyword_index[keyword].append(len(self.chunks) - 1)
        
        print(f"Created {len(self.chunks)} smart chunks from {len(raw_data)} articles with rich metadata.")

    def _extract_car_type(self, title: str) -> str:
        """זיהוי סוג הרכב"""
        types_map = {
            'C3': 'supermini',
            'RS3': 'compact',
            'EV9': 'suv',
            'S6': 'suv',
            'אלנטרה': 'sedan',
            'Elantra': 'sedan',
            'HT': 'suv',
            'לינק': 'compact',  # Link & Co 01
            '01': 'compact',   # Link & Co 01 (title contains "01")
        }
        for key, type_val in types_map.items():
            if key in title:
                return type_val
        return 'unknown'

    def _get_encoder(self):
        """Load encoder once (called at startup before _build_index)."""
        if self.encoder is None:
            print("Loading embedding model...")
            self.encoder = SentenceTransformer(self._encoder_model_name)
            print("Embedding model loaded.")
        return self.encoder

    def _embeddings_path(self) -> Tuple[str, str]:
        """Path to saved embeddings and meta (same dir as scraped_data.json)."""
        data_dir = os.path.dirname(self.data_path)
        return (
            os.path.join(data_dir, "chunk_embeddings.npy"),
            os.path.join(data_dir, "chunk_embeddings_meta.json"),
        )

    def _build_index(self):
        """Build or load chunk vectors. Saves to disk so next startup loads from file (no encoder over chunks)."""
        if self.embeddings is not None:
            return
        emb_path, meta_path = self._embeddings_path()
        n_chunks = len(self.chunks)

        # Try load existing vectors (only if chunk count matches)
        if os.path.isfile(emb_path) and os.path.isfile(meta_path):
            try:
                with open(meta_path, "r", encoding="utf-8") as f:
                    meta = json.load(f)
                if meta.get("n_chunks") == n_chunks and meta.get("model") == self._encoder_model_name:
                    self.embeddings = np.load(emb_path)
                    if self.embeddings.shape[0] == n_chunks:
                        print(f"Loaded {n_chunks} embeddings from {emb_path}")
                        return
            except Exception as e:
                print(f"Could not load saved embeddings: {e}. Rebuilding...")

        # Build and save
        print("Building chunk embeddings...")
        encoder = self._get_encoder()
        self.embeddings = encoder.encode(self.chunks, batch_size=32)
        norm = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        self.embeddings = self.embeddings / norm

        os.makedirs(os.path.dirname(emb_path) or ".", exist_ok=True)
        np.save(emb_path, self.embeddings)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump({"n_chunks": n_chunks, "model": self._encoder_model_name}, f, indent=0)
        print(f"Saved {n_chunks} embeddings to {emb_path}")

    def _hybrid_search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Hybrid search: prebuilt chunk vectors + keyword index. Only the query is embedded at runtime."""
        # Embeddings are built at startup; this is a no-op if already built.
        self._build_index()

        # נרמול השאילתה
        normalized_query = self._normalize_car_name(query)
        # אם הנרמול לא מצא canonical id, השתמש בשאילתה המקורית
        if normalized_query is None:
            normalized_query = query
        
        # חיפוש וקטורי
        # Ensure we pass a string to the encoder
        query_text_for_embedding = normalized_query if isinstance(normalized_query, str) else str(normalized_query)
        encoder = self._get_encoder()
        query_embedding = encoder.encode([query_text_for_embedding])
        query_embedding = query_embedding / np.linalg.norm(query_embedding)
        scores = np.dot(self.embeddings, query_embedding.T).flatten()
        
        # חיפוש מילות מפתח
        keywords = self._extract_keywords(normalized_query)
        keyword_matches = set()
        for keyword in keywords:
            if keyword in self.keyword_index:
                keyword_matches.update(self.keyword_index[keyword])
        
        # שילוב התוצאות
        combined_scores = scores.copy()
        for idx in keyword_matches:
            combined_scores[idx] += 0.3  # בונוס למשחקי מילות מפתח
        
        # בחירת Top K
        top_indices = np.argsort(combined_scores)[-top_k:][::-1]
        
        results = []
        for idx in top_indices:
            results.append({
                "text": self.chunks[idx],
                "metadata": self.chunk_metadata[idx],
                "score": float(combined_scores[idx])
            })
        
        return results

    def retrieve(self, query: str, top_k: int = 5):
        """עצה 3: חיפוש היברידי במקום רק וקטורי"""
        return self._hybrid_search(query, top_k)

    def _extract_comparison_data(self, car1: str, car2: str) -> Dict:
        """עצה 5: חילוץ נתונים מובנים להשוואה"""
        specs_map = {
            'power': r'(\d+)\s*כ"ס',
            'torque': r'(\d+\.?\d*)\s*קג"מ',
            'acceleration': r'(\d+\.?\d*)\s*שניות?\s*ל-?100',
            'top_speed': r'(\d+)\s*קמ"ש',
            'consumption': r'(\d+\.?\d*)\s*ק"מ/l',
        }
        
        # חיפוש נתונים עבור כל רכב
        car1_data = {}
        car2_data = {}
        
        results1 = self._hybrid_search(car1, top_k=10)
        results2 = self._hybrid_search(car2, top_k=10)
        
        for result in results1:
            for spec, pattern in specs_map.items():
                match = re.search(pattern, result['text'])
                if match and spec not in car1_data:
                    car1_data[spec] = match.group(1)
        
        for result in results2:
            for spec, pattern in specs_map.items():
                match = re.search(pattern, result['text'])
                if match and spec not in car2_data:
                    car2_data[spec] = match.group(1)
        
        return {
            "car1": {"name": car1, "specs": car1_data},
            "car2": {"name": car2, "specs": car2_data}
        }

    def _is_comparison_question(self, query: str) -> bool:
        """Rule-based only (regex/keywords). No LLM. Detects comparison vs single-model questions."""
        if not query:
            return False
        q = query.lower()
        comparison_keywords = [
            'מה יותר טוב', 'השוואה', 'לעומת', 'vs', 'versus', 'compare', 'better than',
            'בין ', ' השוואה', 'להשוות', 'compare between',
        ]
        if any(k in q for k in comparison_keywords):
            return True
        # "X vs Y" or "X ו-Y" / "X and Y" with two model-like tokens
        if re.search(r'\bvs\b|\bversus\b| ו | and | versus ', q, re.IGNORECASE):
            return True
        return False

    def _maintain_conversation_history(self, query: str, response: str, max_turns: int = 5):
        """עצה 10: ניהול היסטוריית שיחה חכמה"""
        self.conversation_history.append({
            "query": query,
            "response": response
        })
        
        # שמירת רק 5 תורות אחרונות
        if len(self.conversation_history) > max_turns:
            self.conversation_history = self.conversation_history[-max_turns:]

    def _get_context_from_history(self) -> str:
        """חילוץ הקשר מהיסטוריית השיחה – Q ו-A כדי שהמודל יהיה מודע לשיחה בהמשך"""
        if not self.conversation_history:
            return ""
        context_lines = []
        for turn in self.conversation_history[-3:]:  # 3 תורות אחרונות
            q = (turn.get("query") or "")[:200]
            a = (turn.get("response") or "")[:300]
            context_lines.append(f"Q: {q}\nA: {a}")
        return "\n\n".join(context_lines)

    def _get_mentioned_cars_in_conversation(self, max_turns: int = 5) -> set:
        """דגמים שמופיעים בהיסטוריית השיחה הנוכחית (שאלות + תשובות) – כדי לדבוק בהם ב־follow-up."""
        mentioned = set()
        if not self.conversation_history:
            return mentioned
        for turn in self.conversation_history[-max_turns:]:
            for key in ("query", "response"):
                text = (turn.get(key) or "")[:1500]
                mentioned.update(self._find_supported_canonicals_in_text(text))
        return mentioned

    def _get_cache_key(self, query: str) -> str:
        """Generate cache key for query"""
        return hashlib.md5(query.lower().encode()).hexdigest()

    @staticmethod
    def _extract_retry_after_seconds(error_text: str) -> Optional[int]:
        """Best-effort parse of Retry-After seconds from an error message."""
        if not error_text:
            return None
        # Common patterns: "Retry-After: 60", "retry_after: 60", "Retry after 60s"
        m = re.search(r"retry[-_\s]*after[:\s]+(\d+)", error_text, re.IGNORECASE)
        if m:
            try:
                return int(m.group(1))
            except Exception:
                return None
        m = re.search(r"retry\s+after\s+(\d+)\s*s", error_text, re.IGNORECASE)
        if m:
            try:
                return int(m.group(1))
            except Exception:
                return None
        return None

    @staticmethod
    def _is_hebrew(text: str) -> bool:
        return bool(re.search(r"[\u0590-\u05FF]", text or ""))

    # Canonical id -> display name for supported models only (allowed to recommend/compare).
    CANONICAL_TO_DISPLAY = {
        "citroen_c3": "Citroen C3",
        "audi_rs3": "Audi RS3",
        "kia_ev9": "Kia EV9",
        "mg_s6": "MG S6",
        "hyundai_elantra_n": "Hyundai Elantra N",
        "aion_ht": "Aion HT",
        "genesis_gv80": "Genesis GV80",
        "link_co_01": "Link & Co 01",
    }

    @classmethod
    def _supported_cars_display(cls) -> List[str]:
        """
        The only car models this app is allowed to recommend/compare.
        Must correspond to articles that exist in `data_ingestion/scraped_data.json`.
        """
        return list(cls.CANONICAL_TO_DISPLAY.values())

    def _find_supported_canonicals_in_text(self, text: str) -> set:
        """Find all supported canonical car ids mentioned in the text (only canonicals we allow)."""
        found = set()
        if not text:
            return found
        txt = text.lower()
        allowed = set(self.CANONICAL_TO_DISPLAY.keys())

        # Regex patterns
        for pattern, canonical in getattr(self, "regex_patterns", {}).items():
            try:
                if canonical in allowed and re.search(pattern, txt):
                    found.add(canonical)
            except re.error:
                continue

        # Variant map
        for variant, canonical in self.car_normalization.items():
            if canonical in allowed and re.search(rf"\b{re.escape(variant.lower())}\b", txt):
                found.add(canonical)

        return found

    def _get_ordered_supported_canonicals_in_text(self, text: str) -> List[str]:
        """Return supported canonicals mentioned in text, in order of first appearance."""
        if not text:
            return []
        txt = text.lower()
        allowed = set(self.CANONICAL_TO_DISPLAY.keys())
        # canonical -> earliest start position
        positions: Dict[str, int] = {}

        for pattern, canonical in getattr(self, "regex_patterns", {}).items():
            if canonical not in allowed:
                continue
            try:
                m = re.search(pattern, txt)
                if m:
                    pos = m.start()
                    if canonical not in positions or pos < positions[canonical]:
                        positions[canonical] = pos
            except re.error:
                continue

        for variant, canonical in self.car_normalization.items():
            if canonical not in allowed:
                continue
            m = re.search(rf"\b{re.escape(variant.lower())}\b", txt)
            if m:
                pos = m.start()
                if canonical not in positions or pos < positions[canonical]:
                    positions[canonical] = pos

        return [c for c in sorted(positions.keys(), key=lambda c: positions[c])]

    @staticmethod
    def _looks_like_specific_car_question(text: str) -> bool:
        """
        Heuristic: decide if user likely asks about a specific car model,
        not a general concept question.
        """
        if not text:
            return False
        t = text.lower()
        # comparison markers
        if re.search(r"\b(vs|versus|compare)\b", t) or any(k in t for k in ["השוואה", "לעומת", "מה יותר טוב", "בין"]):
            return True
        # common “tell me about model” phrasing
        if any(k in t for k in ["tell me about", "what do you think", "review", "מבחן", "דעה על", "מה דעתך", "ספר לי על", "תספר לי על"]):
            return True
        # model-like token patterns (letters+digits, e.g. rs3, ev9, x5)
        if re.search(r"\b[a-z]{1,}\s*\d{1,}\b", t) or re.search(r"\b\d{1,}\s*[a-z]{1,}\b", t) or re.search(r"\b[a-z]{2,}\d{1,}\b", t):
            return True
        return False

    def _unsupported_car_refusal(self, query: str, is_comparison: bool) -> str:
        supported = ", ".join(self._supported_cars_display())
        if self._is_hebrew(query):
            if is_comparison:
                return (
                    "❌ אני יכול להשוות/להמליץ **רק על בסיס מידע שקיים אצלי** מתוך כתבות מ־`auto.co.il`.\n"
                    "נראה שלפחות אחד מהדגמים שביקשת **לא נמצא בבסיס הידע שלי**, ולכן אסור לי להמליץ עליו או להשוות אותו.\n\n"
                    f"✅ דגמים נתמכים כרגע: {supported}\n"
                    "אם תרצה, כתוב השוואה בין שני דגמים מהרשימה."
                )
            return (
                "❌ אני יכול להמליץ **רק על בסיס מידע שקיים אצלי** מתוך כתבות מ־`auto.co.il`.\n"
                "הדגם שביקשת **לא נמצא בבסיס הידע שלי**, ולכן אסור לי להמליץ עליו.\n\n"
                f"✅ דגמים נתמכים כרגע: {supported}\n"
                "אם תכתוב אחד מהדגמים מהרשימה — אשמח לעזור."
            )
        else:
            if is_comparison:
                return (
                    "❌ I can compare/recommend **only using information I have** from articles scraped from `auto.co.il`.\n"
                    "At least one of the models you mentioned is **not in my knowledge base**, so I’m not allowed to recommend or compare it.\n\n"
                    f"✅ Currently supported models: {supported}\n"
                    "If you want, ask for a comparison between two models from this list."
                )
            return (
                "❌ I can recommend **only using information I have** from articles scraped from `auto.co.il`.\n"
                "The model you asked about is **not in my knowledge base**, so I’m not allowed to recommend it.\n\n"
                f"✅ Currently supported models: {supported}\n"
                "Ask about one of these models and I’ll help."
            )

    def _wait_for_rate_limit(self):
        """Enforce minimum delay between API requests to avoid rate limiting"""
        # Thread-safe: Gradio can execute requests concurrently.
        with self._rate_limit_lock:
            now = time.time()

            # Honor global cooldown after a 429.
            if now < self._rate_limited_until:
                time.sleep(self._rate_limited_until - now)
                now = time.time()

            elapsed = now - self.last_request_time
            if elapsed < self.request_delay:
                time.sleep(self.request_delay - elapsed)
            self.last_request_time = time.time()

    def _get_openrouter_key(self) -> Optional[str]:
        """OpenRouter API key from env. HF Spaces: add Secret OPENROUTER_API_KEY. Local: .env openRouter_API_KEY."""
        for name in ("OPENROUTER_API_KEY", "openRouter_API_KEY", "OPENROUTER_APIKEY", "OPENROUTER_KEY"):
            v = os.environ.get(name)
            if v and str(v).strip():
                return str(v).strip()
        return None

    def _call_openrouter(self, system_prompt: str, prompt: str, timeout_seconds: int = 28) -> Optional[str]:
        """Call OpenRouter API for a fast response. Returns text or None on failure."""
        key = self._get_openrouter_key()
        if not key or not key.strip():
            PIPELINE_LOG.info("OpenRouter key not set - add Secret OPENROUTER_API_KEY in HF Space settings. Using Gemini.")
            return None
        url = "https://openrouter.ai/api/v1/chat/completions"
        # Prefer fast Gemini on OpenRouter (gemini-2.0-flash-exp:free was deprecated/404; use gemini-3-flash-preview)
        model = os.environ.get("OPENROUTER_MODEL", "google/gemini-3-flash-preview")
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": self.max_output_tokens,
            "temperature": 0.4,
        }
        headers = {"Authorization": f"Bearer {key.strip()}", "Content-Type": "application/json"}
        try:
            self._wait_for_rate_limit()
            PIPELINE_LOG.info("Calling OpenRouter model=%s timeout=%ds", model, timeout_seconds)
            r = requests.post(url, json=payload, headers=headers, timeout=timeout_seconds)
            r.raise_for_status()
            data = r.json()
            choices = data.get("choices") or []
            if not choices:
                return None
            content = (choices[0].get("message") or {}).get("content") or ""
            if not content or not str(content).strip():
                return None
            out = str(content).strip()
            PIPELINE_LOG.info("OpenRouter response OK len=%d", len(out))
            return out
        except Exception as e:
            PIPELINE_LOG.warning("OpenRouter failed: %s", str(e)[:120])
            return None

    def _call_api_with_backoff(self, system_prompt: str, prompt: str, models: List[str]):
        """When OPENROUTER_API_KEY is set: use only OpenRouter (no Gemini). Else: use Gemini with backoff."""
        PIPELINE_LOG.info("_call_api_with_backoff START models=%s prompt_len=%d", models, len(prompt))

        openrouter_key = self._get_openrouter_key()
        if openrouter_key:
            # Generation: use only OpenRouter when key is set (avoid Gemini rate limit)
            PIPELINE_LOG.info("OpenRouter key present - using OpenRouter only for generation (no Gemini)")
            for attempt in range(2):
                result = self._call_openrouter(system_prompt, prompt, timeout_seconds=35)
                if result:
                    return result
                PIPELINE_LOG.warning("OpenRouter attempt %d failed, retrying...", attempt + 1)
            return (
                "❌ OpenRouter request failed after retries. Check OPENROUTER_API_KEY and OPENROUTER_MODEL in Space secrets. "
                "See logs for details."
            )

        # No OpenRouter key: use Gemini
        PIPELINE_LOG.info("OpenRouter key not set - using Gemini for generation")
        max_attempts_per_model = 8
        max_rate_limit_wait_s = 180  # wait up to 3 minutes per attempt before retry

        # Try each model
        for model_idx, model in enumerate(models):
            for attempt in range(max_attempts_per_model):
                try:
                    # Wait before API call to respect rate limits
                    self._wait_for_rate_limit()
                    PIPELINE_LOG.info("Calling LLM model=%s attempt=%d", model, attempt + 1)

                    try:
                        model_obj = genai.GenerativeModel(model, system_instruction=system_prompt)
                        contents = prompt
                    except TypeError:
                        model_obj = genai.GenerativeModel(model)
                        contents = system_prompt + "\n\n" + prompt
                    response = model_obj.generate_content(
                        contents,
                        generation_config=self._generation_config,
                    )
                    text = getattr(response, "text", None) if response else None
                    if not (text and str(text).strip()):
                        PIPELINE_LOG.warning("LLM returned empty or no text")
                        return "❌ The model returned no text (possibly blocked). Please try rephrasing."
                    out = str(text).strip()
                    PIPELINE_LOG.info("LLM response OK len=%d preview=%s", len(out), (out[:200] + "..." if len(out) > 200 else out))
                    return out
                    
                except Exception as e:
                    error_text = str(e)
                    error_msg = error_text.lower()
                    
                    # Handle rate limit errors - wait longer and retry more
                    if "429" in error_msg or "rate" in error_msg or "quota" in error_msg or "too many" in error_msg:
                        retry_after = self._extract_retry_after_seconds(error_text) or 0
                        backoff = min(max_rate_limit_wait_s, 10 * (2 ** min(attempt, 5)))
                        jitter = random.uniform(0.0, 1.0)
                        wait_time = min(max_rate_limit_wait_s, max(retry_after, backoff) + jitter)

                        # Global cooldown so concurrent calls don't stampede.
                        with self._rate_limit_lock:
                            self._rate_limited_until = max(self._rate_limited_until, time.time() + wait_time)

                        print(f"⚠️ Rate limited on {model}. Waiting {wait_time:.1f}s before retry ({attempt + 1}/{max_attempts_per_model})...")
                        time.sleep(wait_time)

                        # Retry same model unless attempts exhausted.
                        if attempt < max_attempts_per_model - 1:
                            continue
                        if model_idx < len(models) - 1:
                            print("⚠️ Rate limit persists. Trying next model...")
                            break
                        msg = "⚠️ API Rate Limit: המתין כ־2–3 דקות ונסה שוב. / Please wait 2–3 minutes and try again."
                        PIPELINE_LOG.warning("_call_api_with_backoff returning rate-limit message")
                        return msg
                    
                    # Handle 404 errors - model not available, try next one
                    elif "404" in error_msg or "not found" in error_msg or "not supported" in error_msg:
                        if model_idx < len(models) - 1:
                            print(f"⚠️ Model {model} not available. Trying next model...")
                            time.sleep(2)
                            break  # Move to next model
                        else:
                            PIPELINE_LOG.warning("_call_api_with_backoff no available models")
                            return f"❌ No available models. Please try again later."
                    
                    # Other errors - retry with same model once
                    else:
                        if attempt < 2:
                            sleep_s = 1.5 * (attempt + 1)
                            print(f"⚠️ Error: {str(e)[:80]}. Retrying in {sleep_s:.1f}s...")
                            time.sleep(sleep_s)
                            continue

                        # If retry also failed, try next model
                        if model_idx < len(models) - 1:
                            print("⚠️ Trying next model...")
                            break
                        PIPELINE_LOG.warning("_call_api_with_backoff error: %s", str(e)[:150])
                        return f"❌ Error: {str(e)[:100]}"
        
        PIPELINE_LOG.warning("_call_api_with_backoff exhausted all models, returning failure")
        return "❌ Failed to get response from API"

    def _call_api_with_backoff_stream(self, system_prompt: str, prompt: str, models: List[str]):
        """
        Streaming version: yields incremental text while generating.
        Returns the final text via StopIteration.value (internal use) and also yields it progressively.
        """
        max_attempts_per_model = 8
        max_rate_limit_wait_s = 180

        for model_idx, model in enumerate(models):
            for attempt in range(max_attempts_per_model):
                try:
                    self._wait_for_rate_limit()
                    try:
                        model_obj = genai.GenerativeModel(model, system_instruction=system_prompt)
                        contents = prompt
                    except TypeError:
                        model_obj = genai.GenerativeModel(model)
                        contents = system_prompt + "\n\n" + prompt
                    stream = model_obj.generate_content(
                        contents,
                        generation_config=self._generation_config,
                        stream=True,
                    )

                    acc = ""
                    start = time.time()
                    timeout = self._api_timeout_seconds
                    for chunk in stream:
                        if time.time() - start > timeout:
                            yield (acc + "\n\n⏱️ Request timed out. Partial response above. Try again or shorten the query.") if acc else "⏱️ Request timed out. Please try again."
                            return
                        piece = getattr(chunk, "text", "") or ""
                        if not piece:
                            continue
                        acc += piece
                        yield acc
                    return

                except Exception as e:
                    error_text = str(e)
                    error_msg = error_text.lower()

                    if "429" in error_msg or "rate" in error_msg or "quota" in error_msg or "too many" in error_msg:
                        retry_after = self._extract_retry_after_seconds(error_text) or 0
                        backoff = min(max_rate_limit_wait_s, 10 * (2 ** min(attempt, 5)))
                        jitter = random.uniform(0.0, 1.0)
                        wait_time = min(max_rate_limit_wait_s, max(retry_after, backoff) + jitter)
                        with self._rate_limit_lock:
                            self._rate_limited_until = max(self._rate_limited_until, time.time() + wait_time)
                        time.sleep(wait_time)
                        if attempt < max_attempts_per_model - 1:
                            continue
                        if model_idx < len(models) - 1:
                            break
                        yield "⚠️ API Rate Limit: המתין כ־2–3 דקות ונסה שוב. / Please wait 2–3 minutes and try again."
                        return

                    if "404" in error_msg or "not found" in error_msg or "not supported" in error_msg:
                        if model_idx < len(models) - 1:
                            time.sleep(1.0)
                            break
                        yield "❌ No available models. Please try again later."
                        return

                    if attempt < 2:
                        time.sleep(1.0 + attempt)
                        continue

                    if model_idx < len(models) - 1:
                        break
                    yield f"❌ Error: {error_text[:120]}"
                    return

        yield "❌ Failed to get response from API"

    def configure_api(self, api_key: str) -> None:
        """Configure Gemini API key (for use by external agent)."""
        genai.configure(api_key=api_key)

    def prepare_generation(self, query: str) -> Tuple[Optional[str], Optional[str], Optional[str], List[str]]:
        """
        Run RAG pipeline up to (but not including) the LLM call.
        Returns (refusal_message, system_prompt, user_prompt, steps_log).
        If refusal_message is set, the other three are None / empty; otherwise use prompts for generation.
        """
        PIPELINE_LOG.info("prepare_generation START query=%r", query[:80] if query else "")
        steps_log: List[str] = []

        steps_log.append("🔍 Normalizing car names...")
        canonical = self._normalize_car_name(query)
        ordered_supported = self._get_ordered_supported_canonicals_in_text(query)
        current_query_cars = set(ordered_supported) if ordered_supported else set()
        if canonical:
            current_query_cars.add(canonical)
            search_query = canonical
        else:
            search_query = query

        # דגמים שהמשתמש כבר דיבר עליהם בשיחה – ב־follow-up נדבוק רק בהם
        mentioned_in_session = self._get_mentioned_cars_in_conversation(max_turns=5)
        is_follow_up = (
            len(mentioned_in_session) > 0
            and (not current_query_cars or current_query_cars <= mentioned_in_session)
        )
        if is_follow_up:
            steps_log.append(f"📌 Follow-up: דבקים בדגמי השיחה – {', '.join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in sorted(mentioned_in_session))}")

        is_comparison = self._is_comparison_question(query)
        if is_comparison:
            steps_log.append("📋 Detected: comparison question (rule-based)")
        else:
            steps_log.append("📋 Detected: single-model question (rule-based)")

        ordered_supported = self._get_ordered_supported_canonicals_in_text(query)
        # Show user which cars were identified (for comparison: both; for single: one)
        if is_comparison:
            if len(ordered_supported) >= 2:
                names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in ordered_supported[:2])
                steps_log.append(f"✅ זיהוי דגמים להשוואה: {names}")
            elif len(ordered_supported) == 1:
                one_display = self.CANONICAL_TO_DISPLAY.get(ordered_supported[0], ordered_supported[0])
                steps_log.append(f"✅ זיהוי דגם אחד (השני לא ברשימה): {one_display}")
        else:
            if canonical:
                steps_log.append(f"✅ Recognized canonical id: {canonical}")
            elif not ordered_supported:
                steps_log.append("ℹ️ No canonical car found; using full query for search")

        if is_comparison:
            if len(ordered_supported) == 0:
                refusal = self._unsupported_car_refusal(query, is_comparison=True)
                PIPELINE_LOG.info("prepare_generation END refusal=True (comparison, no supported) steps=%d", len(steps_log))
                return (refusal, None, None, steps_log)
        else:
            if not canonical and not ordered_supported and self._looks_like_specific_car_question(query):
                refusal = self._unsupported_car_refusal(query, is_comparison=False)
                PIPELINE_LOG.info("prepare_generation END refusal=True (single, unsupported) steps=%d", len(steps_log))
                return (refusal, None, None, steps_log)

        steps_log.append("🔎 Searching knowledge base (vectors + keywords)...")
        comparison_prompt = ""
        context_results = []

        if is_comparison:
            if len(ordered_supported) >= 2:
                car1_can, car2_can = ordered_supported[0], ordered_supported[1]
                car1_display = self.CANONICAL_TO_DISPLAY.get(car1_can, car1_can)
                car2_display = self.CANONICAL_TO_DISPLAY.get(car2_can, car2_can)
                steps_log.append("📊 Extracting structured comparison data (regex)...")
                comparison_data = self._extract_comparison_data(car1_can, car2_can)
                context_results = self._hybrid_search(search_query, top_k=self.max_chunks_comparison)
                steps_log.append(f"✅ Retrieved {len(context_results)} chunks for comparison")
                comparison_prompt = f"""
Based on the car reviews, create a structured comparison between {car1_display} and {car2_display}:

Format your response as:
**יתרונות {car1_display}:**
- [list advantages]

**יתרונות {car2_display}:**
- [list advantages]

**המלצה לפי פרופיל משתמש:**
- [personalized recommendation]

Structured Data:
{json.dumps(comparison_data, ensure_ascii=False, indent=2)}

Context from reviews:
"""
            elif len(ordered_supported) == 1:
                # One model in list, one or more not: can't compare but can tell about the one we know
                one_can = ordered_supported[0]
                one_display = self.CANONICAL_TO_DISPLAY.get(one_can, one_can)
                steps_log.append(f"📋 One supported model ({one_display}); providing info only for it")
                context_results = self._hybrid_search(one_can, top_k=self.max_chunks_general)
                steps_log.append(f"✅ Retrieved {len(context_results)} chunks")
                if self._is_hebrew(query):
                    comparison_prompt = f"""
המשתמש ביקש השוואה. אחד הדגמים (או יותר) שהוא ציין **לא נמצא בבסיס הידע שלי** – אי אפשר להשוות לדגמים שלא למדתי עליהם.
בתשובתך: ציין בקצרה שאינך יכול להשוות לדגמים שלא למדת עליהם, ואז ספק מידע מלא רק על הדגם שכן נמצא ברשימה: **{one_display}**, בהתבסס על ההקשר למטה.

Context from reviews:
"""
                else:
                    comparison_prompt = f"""
The user asked for a comparison. One or more models they mentioned are **not in my knowledge base** – I cannot compare to models I haven't learned about.
In your response: briefly state that you cannot compare to models you haven't learned about, then provide full information only about the model that is in my list: **{one_display}**, based on the context below.

Context from reviews:
"""
            else:
                context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
                comparison_prompt = "Answer in the same language as the user's question. "
                steps_log.append(f"✅ Retrieved {len(context_results)} chunks")
        else:
            # ב־follow-up מחפשים רק בהקשר לדגמים שהמשתמש כבר דיבר עליהם
            retrieval_query = search_query
            if is_follow_up and mentioned_in_session:
                retrieval_query = " ".join(
                    self.CANONICAL_TO_DISPLAY.get(c, c) for c in sorted(mentioned_in_session)[:4]
                )
            context_results = self._hybrid_search(retrieval_query, top_k=self.max_chunks_general)
            steps_log.append(f"✅ Retrieved {len(context_results)} relevant chunks")

        context_text = ""
        for r in context_results:
            meta = r['metadata']
            context_text += f"""
Source: {meta['title']}
Topic: {meta['topic']}
Content: {r['text'][:self.max_context_chars_per_chunk]}...

"""
        conversation_context = self._get_context_from_history()
        session_models_instruction = ""
        if is_follow_up and mentioned_in_session:
            session_names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in sorted(mentioned_in_session))
            session_models_instruction = f"""
5. **Session context:** In this chat the user has been discussing only these models: {session_names}. For this follow-up question, answer ONLY in the context of these models. Do not introduce or recommend other models unless (a) the user explicitly asks to compare with another model, or (b) you are giving a brief tip like "אם תרצה השוואה לדגם אחר, אפשר לשאול על..." / "If you want to compare with another model, you can ask about...". Stick to the session models for the main answer.
"""
        system_prompt = """You are an expert automotive assistant. Your reply in the chat MUST be a single, concrete verbal answer that the user will see directly.

Your task:
1. Use ONLY the "Context from car reviews" provided in the user message.
2. Combine everything you understood from the user's question with everything relevant you retrieved from the sources into ONE coherent, verbal answer—as a car expert would say to a friend in the chat. Do not output raw snippets, bullet-only lists, or separate fragments; write a unified answer (paragraphs and/or structured sections) that directly answers the question.
3. Respond in the same language as the user (Hebrew or English). For comparison questions, provide a structured analysis with clear advantages for each vehicle, still in one cohesive reply.
4. If the context is empty or irrelevant, say clearly that you have no information from your knowledge base for this question.
""" + session_models_instruction + """
The user expects to see this single aggregated answer in the chat—make it complete and concrete."""
        user_prompt = f"""Context from car reviews:
{context_text if context_text.strip() else "(No matching chunks found.)"}

Previous conversation context (last turns):
{conversation_context}

User question: {query}

{comparison_prompt}

Synthesize the context above into one clear, verbal answer that aggregates all relevant information and directly answers the user's question. Your entire response will be shown to the user as the chat reply:"""
        PIPELINE_LOG.info("prepare_generation END refusal=%s has_system_prompt=%s has_user_prompt=%s steps=%d",
                          False, bool(system_prompt), bool(user_prompt), len(steps_log))
        return (None, system_prompt, user_prompt, steps_log)

    def generate_response(self, query: str, history, api_key: str):
        """יצירת תשובה חכמה עם כל 10 העצות"""
        if not api_key:
            return "Error: Gemini API Key is missing."
        
        # Prepare processing log for UX transparency
        processing_steps = []

        # Check cache for identical queries
        cache_key = self._get_cache_key(query)
        if cache_key in self.response_cache:
            # Return cached response but include note about cache
            cached = self.response_cache[cache_key]
            return f"🔁 Returned cached result\n\n{cached}"

        genai.configure(api_key=api_key)

        # Step 1 - Normalization
        processing_steps.append("🔍 Normalizing car names...")
        canonical = self._normalize_car_name(query)
        if canonical:
            search_query = canonical
        else:
            search_query = query

        # עצה 7: זיהוי שאלות השוואתיות
        is_comparison = self._is_comparison_question(query)

        # Policy guard: do not recommend models without auto.co.il articles in our KB
        ordered_supported = self._get_ordered_supported_canonicals_in_text(query)
        # Show user which cars were identified (for comparison: both; for single: one)
        if is_comparison:
            if len(ordered_supported) >= 2:
                names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in ordered_supported[:2])
                processing_steps.append(f"✅ זיהוי דגמים להשוואה: {names}")
            elif len(ordered_supported) == 1:
                one_display = self.CANONICAL_TO_DISPLAY.get(ordered_supported[0], ordered_supported[0])
                processing_steps.append(f"✅ זיהוי דגם אחד (השני לא ברשימה): {one_display}")
        else:
            if canonical:
                processing_steps.append(f"✅ Recognized canonical id: {canonical}")
            else:
                processing_steps.append("ℹ️ No canonical car found; using full query for search")
        if is_comparison:
            if len(ordered_supported) == 0:
                return self._unsupported_car_refusal(query, is_comparison=True)
        else:
            if not canonical and not ordered_supported and self._looks_like_specific_car_question(query):
                return self._unsupported_car_refusal(query, is_comparison=False)
        
        if is_comparison:
            if len(ordered_supported) >= 2:
                car1_can, car2_can = ordered_supported[0], ordered_supported[1]
                car1_display = self.CANONICAL_TO_DISPLAY.get(car1_can, car1_can)
                car2_display = self.CANONICAL_TO_DISPLAY.get(car2_can, car2_can)
                processing_steps.append("📊 Extracting structured comparison data...")
                comparison_data = self._extract_comparison_data(car1_can, car2_can)
                context_results = self._hybrid_search(search_query, top_k=self.max_chunks_comparison)

                comparison_prompt = f"""
Based on the car reviews, create a structured comparison between {car1_display} and {car2_display}:

Format your response as:
**יתרונות {car1_display}:**
- [list advantages]

**יתרונות {car2_display}:**
- [list advantages]

**המלצה לפי פרופיל משתמש:**
- [personalized recommendation]

Structured Data:
{json.dumps(comparison_data, ensure_ascii=False, indent=2)}

Context from reviews:
"""
            elif len(ordered_supported) == 1:
                one_can = ordered_supported[0]
                one_display = self.CANONICAL_TO_DISPLAY.get(one_can, one_can)
                processing_steps.append(f"📋 One supported model ({one_display}); providing info only for it")
                context_results = self._hybrid_search(one_can, top_k=self.max_chunks_general)
                if self._is_hebrew(query):
                    comparison_prompt = f"""
המשתמש ביקש השוואה. אחד הדגמים (או יותר) שהוא ציין **לא נמצא בבסיס הידע שלי** – אי אפשר להשוות לדגמים שלא למדתי עליהם.
בתשובתך: ציין בקצרה שאינך יכול להשוות לדגמים שלא למדת עליהם, ואז ספק מידע מלא רק על הדגם שכן נמצא ברשימה: **{one_display}**, בהתבסס על ההקשר למטה.

Context from reviews:
"""
                else:
                    comparison_prompt = f"""
The user asked for a comparison. One or more models they mentioned are **not in my knowledge base** – I cannot compare to models I haven't learned about.
In your response: briefly state that you cannot compare to models you haven't learned about, then provide full information only about the model that is in my list: **{one_display}**, based on the context below.

Context from reviews:
"""
            else:
                context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
                comparison_prompt = "Answer in the same language as the user's question. "
        else:
            processing_steps.append("🔎 Searching knowledge base (hybrid vectors + keywords)...")
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
            comparison_prompt = ""
        
        # בנייה של הקשר עם מטא-דאטה
        context_text = ""
        for r in context_results:
            meta = r['metadata']
            context_text += f"""
Source: {meta['title']}
Topic: {meta['topic']}
Content: {r['text'][:self.max_context_chars_per_chunk]}...

"""
        
        # עצה 10: שמירת הקשר מהיסטוריה
        conversation_context = self._get_context_from_history()
        
        # עצה 9: פרומפט מותאם
        system_prompt = """You are an expert automotive assistant. Your answer MUST be based only on the "Context from car reviews" provided in the user message.
Your task: aggregate and summarize the information from that context and give a detailed, verbal answer as a car expert would to a friend. Always output a full paragraph (or more) that directly answers the user's question—never leave the answer empty or vague.
Respond in the same language as the user (Hebrew or English). For comparison questions, provide a structured analysis with clear advantages for each vehicle.
If the context is empty or irrelevant, say you have no information from your knowledge base for this question."""
        
        prompt = f"""Context from car reviews:
{context_text if context_text.strip() else "(No matching chunks found.)"}

Previous conversation context (last turns):
{conversation_context}

User question: {query}

{comparison_prompt}

Based on the context above, provide a clear answer that aggregates the information and answers the user's question:"""

        # Prepare generation step
        processing_steps.append("💭 Generating response with Gemini...")

        # Use new rate-limited API call with backoff and caching
        # Prefer fast Flash models for latency; fall back only within Flash tier.
        models_to_try = ['gemini-2.0-flash', 'gemini-1.5-flash']
        response_text = self._call_api_with_backoff(system_prompt, prompt, models_to_try)
        
        # Cache only successful responses (avoid caching transient rate limit/errors)
        if not (response_text.startswith("⚠️") or response_text.startswith("❌")):
            self.response_cache[cache_key] = response_text

        # עצה 10: שמירת התשובה בהיסטוריה
        self._maintain_conversation_history(query, response_text)

        # Prepend processing steps for UX transparency
        processing_header = "\n".join(processing_steps)
        full_response = f"{processing_header}\n\n{response_text}"
        return full_response

    def generate_response_stream(self, query: str, history, api_key: str):
        """
        Stream progress: show each pipeline step as it completes (no generic placeholders).
        Only one LLM call at the end; normalization, comparison detection, and search are offline/rule-based.
        """
        if not api_key:
            yield "Error: Gemini API Key is missing."
            return

        def steps_text() -> str:
            return "\n".join(processing_steps)

        processing_steps: List[str] = []

        cache_key = self._get_cache_key(query)
        if cache_key in self.response_cache:
            yield f"🔁 Returned cached result\n\n{self.response_cache[cache_key]}"
            return

        genai.configure(api_key=api_key)

        # --- Step 1: Normalization (rule-based, no LLM) ---
        processing_steps.append("🔍 Normalizing car names...")
        yield steps_text()

        canonical = self._normalize_car_name(query)
        if canonical:
            search_query = canonical
        else:
            search_query = query

        # --- Step 2: Question type (rule-based regex/keywords, no LLM) ---
        is_comparison = self._is_comparison_question(query)
        if is_comparison:
            processing_steps.append("📋 Detected: comparison question (rule-based)")
        else:
            processing_steps.append("📋 Detected: single-model question (rule-based)")
        yield steps_text()

        ordered_supported = self._get_ordered_supported_canonicals_in_text(query)
        # Show user which cars were identified (for comparison: both; for single: one)
        if is_comparison:
            if len(ordered_supported) >= 2:
                names = ", ".join(self.CANONICAL_TO_DISPLAY.get(c, c) for c in ordered_supported[:2])
                processing_steps.append(f"✅ זיהוי דגמים להשוואה: {names}")
            elif len(ordered_supported) == 1:
                one_display = self.CANONICAL_TO_DISPLAY.get(ordered_supported[0], ordered_supported[0])
                processing_steps.append(f"✅ זיהוי דגם אחד (השני לא ברשימה): {one_display}")
        else:
            if canonical:
                processing_steps.append(f"✅ Recognized canonical id: {canonical}")
            else:
                processing_steps.append("ℹ️ No canonical car found; using full query for search")
        yield steps_text()

        if is_comparison:
            if len(ordered_supported) == 0:
                yield self._unsupported_car_refusal(query, is_comparison=True)
                return
        else:
            if not canonical and not ordered_supported and self._looks_like_specific_car_question(query):
                yield self._unsupported_car_refusal(query, is_comparison=False)
                return

        # --- Step 3: Search (index built offline; only query embedding at runtime) ---
        processing_steps.append("🔎 Searching knowledge base (vectors + keywords)...")
        yield steps_text()

        comparison_prompt = ""
        context_results = []

        if is_comparison:
            if len(ordered_supported) >= 2:
                car1_can, car2_can = ordered_supported[0], ordered_supported[1]
                car1_display = self.CANONICAL_TO_DISPLAY.get(car1_can, car1_can)
                car2_display = self.CANONICAL_TO_DISPLAY.get(car2_can, car2_can)
                processing_steps.append("📊 Extracting structured comparison data (regex)...")
                yield steps_text()

                comparison_data = self._extract_comparison_data(car1_can, car2_can)
                context_results = self._hybrid_search(search_query, top_k=self.max_chunks_comparison)
                processing_steps.append(f"✅ Retrieved {len(context_results)} chunks for comparison")
                yield steps_text()

                comparison_prompt = f"""
Based on the car reviews, create a structured comparison between {car1_display} and {car2_display}:

Format your response as:
**יתרונות {car1_display}:**
- [list advantages]

**יתרונות {car2_display}:**
- [list advantages]

**המלצה לפי פרופיל משתמש:**
- [personalized recommendation]

Structured Data:
{json.dumps(comparison_data, ensure_ascii=False, indent=2)}

Context from reviews:
"""
            elif len(ordered_supported) == 1:
                one_can = ordered_supported[0]
                one_display = self.CANONICAL_TO_DISPLAY.get(one_can, one_can)
                processing_steps.append(f"📋 One supported model ({one_display}); providing info only for it")
                context_results = self._hybrid_search(one_can, top_k=self.max_chunks_general)
                processing_steps.append(f"✅ Retrieved {len(context_results)} chunks")
                yield steps_text()
                if self._is_hebrew(query):
                    comparison_prompt = f"""
המשתמש ביקש השוואה. אחד הדגמים (או יותר) שהוא ציין **לא נמצא בבסיס הידע שלי** – אי אפשר להשוות לדגמים שלא למדתי עליהם.
בתשובתך: ציין בקצרה שאינך יכול להשוות לדגמים שלא למדת עליהם, ואז ספק מידע מלא רק על הדגם שכן נמצא ברשימה: **{one_display}**, בהתבסס על ההקשר למטה.

Context from reviews:
"""
                else:
                    comparison_prompt = f"""
The user asked for a comparison. One or more models they mentioned are **not in my knowledge base** – I cannot compare to models I haven't learned about.
In your response: briefly state that you cannot compare to models you haven't learned about, then provide full information only about the model that is in my list: **{one_display}**, based on the context below.

Context from reviews:
"""
            else:
                context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
                comparison_prompt = "Answer in the same language as the user's question. "
                processing_steps.append(f"✅ Retrieved {len(context_results)} chunks")
                yield steps_text()
        else:
            context_results = self._hybrid_search(search_query, top_k=self.max_chunks_general)
            processing_steps.append(f"✅ Retrieved {len(context_results)} relevant chunks")
            yield steps_text()

        # --- Build prompt (no LLM) ---
        context_text = ""
        for r in context_results:
            meta = r['metadata']
            context_text += f"""
Source: {meta['title']}
Topic: {meta['topic']}
Content: {r['text'][:self.max_context_chars_per_chunk]}...

"""
        conversation_context = self._get_context_from_history()
        system_prompt = """You are an expert automotive assistant. Your answer MUST be based only on the "Context from car reviews" provided in the user message.
Your task: aggregate and summarize the information from that context and give a detailed, verbal answer as a car expert would to a friend. Always output a full paragraph (or more) that directly answers the user's question—never leave the answer empty or vague.
Respond in the same language as the user (Hebrew or English). For comparison questions, provide a structured analysis with clear advantages for each vehicle.
If the context is empty or irrelevant, say you have no information from your knowledge base for this question."""
        prompt = f"""Context from car reviews:
{context_text if context_text.strip() else "(No matching chunks found.)"}

Previous conversation context (last turns):
{conversation_context}

User question: {query}

{comparison_prompt}

Based on the context above, provide a clear answer that aggregates the information and answers the user's question:"""

        # --- Step 4: Single LLM call (streamed) ---
        processing_steps.append("💭 Generating response with Gemini...")
        yield steps_text()

        models_to_try = ['gemini-2.0-flash', 'gemini-1.5-flash']
        response_text = ""
        for partial in self._call_api_with_backoff_stream(system_prompt, prompt, models_to_try):
            response_text = partial
            yield f"{steps_text()}\n\n{response_text}"

        # Don't cache errors or timeouts
        if not any(response_text.startswith(p) for p in ("⚠️", "❌", "⏱️")):
            self.response_cache[cache_key] = response_text
        self._maintain_conversation_history(query, response_text)

        processing_steps.append("✅ Done")
        yield f"{steps_text()}\n\n{response_text}"


# Simple test block
if __name__ == "__main__":
    # Create dummy file if not exists for testing import
    if not os.path.exists("scraped_data.json"):
        print("No data found, skipping test.")
    else:
        engine = RAGEngine()
        res = engine.retrieve("How is the Kia EV9?")
        print(f"Top result: {res[0]['text'][:100]}...")