Matis Codjia committed on
Commit
3ee62c8
·
1 Parent(s): 3a00bdd

Auto load

Browse files
Files changed (2) hide show
  1. app.py +131 -459
  2. cache_manager.py +43 -121
app.py CHANGED
@@ -1,42 +1,40 @@
1
  """
2
- Streamlit RAG Viewer avec Cache Intelligent
3
  """
4
 
5
  import streamlit as st
6
  import torch
7
  import torch.nn.functional as F
8
  from transformers import AutoTokenizer, AutoModel
9
- from datasets import load_dataset
10
  import chromadb
11
  from pathlib import Path
12
  import json
13
  import time
14
  import logging
15
  import sys
 
 
 
16
  # Import des modules custom
17
  from cache_manager import CacheManager
18
  from deepseek_caller import DeepSeekCaller
19
  from stats_logger import StatsLogger
20
  from config import DISTANCE_THRESHOLD
21
  from utils import load_css
22
- from huggingface_hub import login, snapshot_download
23
- import os
24
 
25
  # ==========================================
26
  # PAGE CONFIG
27
  # ==========================================
28
  st.set_page_config(
29
  page_title="RAG Feedback System",
30
- page_icon="",
31
  layout="wide",
32
  initial_sidebar_state="expanded"
33
  )
34
 
 
35
  DATASET_ID = "matis35/chroma-rag-storage"
36
- REPO_FOLDER = "chroma_db_storage" # Le nom du dossier DANS le repo HF
37
-
38
- # Le dossier local où Streamlit va stocker la DB
39
- # On se met un niveau au-dessus pour que snapshot_download recrée le dossier "chroma_db_storage" dedans
40
  LOCAL_CACHE_DIR = Path("./chroma_cache")
41
 
42
  # ==========================================
@@ -48,45 +46,38 @@ load_css("assets/style.css")
48
  # STATE MANAGEMENT
49
  # ==========================================
50
  if 'model_loaded' not in st.session_state: st.session_state.model_loaded = False
51
- if 'dataset_loaded' not in st.session_state: st.session_state.dataset_loaded = False
52
  if 'db_initialized' not in st.session_state: st.session_state.db_initialized = False
53
  if 'cache_manager' not in st.session_state: st.session_state.cache_manager = None
54
  if 'deepseek_caller' not in st.session_state: st.session_state.deepseek_caller = None
55
  if 'stats_logger' not in st.session_state: st.session_state.stats_logger = StatsLogger()
56
 
57
  # ==========================================
58
- # HELPER FUNCTIONS
59
  # ==========================================
60
  logging.basicConfig(
61
  level=logging.INFO,
62
  format='%(asctime)s | %(levelname)s | %(message)s',
63
  datefmt='%H:%M:%S',
64
- handlers=[
65
- logging.StreamHandler(sys.stdout)
66
- ]
67
  )
68
-
69
  logger = logging.getLogger("FFGen_System")
 
 
70
  hf_token = os.environ.get("HF_TOKEN")
 
 
71
 
72
  if hf_token:
73
- # Se connecte explicitement
74
  login(token=hf_token)
75
- print("Successfully connected to huggingface")
76
- else:
77
- try:
78
- if "HF_TOKEN" in st.secrets:
79
- login(token=st.secrets["HF_TOKEN"])
80
- print("Connected via st.secrets")
81
- else:
82
- print("No HF key found")
83
- except FileNotFoundError:
84
- print("Local execution without secrets")
85
  @st.cache_resource
86
  def load_full_model(model_path: str):
87
- """Load standard HuggingFace model."""
88
- st.info(f"Loading model from: {model_path}")
89
- logger.info(f" Loading from: {model_path}...")
90
  try:
91
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
92
  if tokenizer.pad_token is None:
@@ -97,196 +88,110 @@ def load_full_model(model_path: str):
97
  trust_remote_code=True,
98
  device_map="auto"
99
  )
100
- logger.info(f"Modèle chargé avec succès !")
101
  model.eval()
102
  return model, tokenizer
103
  except Exception as e:
104
- st.error(f"Erreur de chargement: {e}")
105
- logger.error("Echec du chargement du modèle !")
106
  return None, None
107
 
108
  def encode_text(text: str, model, tokenizer):
109
- """Encode text to embedding."""
110
  device = next(model.parameters()).device
111
-
112
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
113
  inputs = {k: v.to(device) for k, v in inputs.items()}
114
-
115
  with torch.no_grad():
116
  outputs = model(**inputs)
117
  embeddings = outputs.last_hidden_state.mean(dim=1)
118
  embeddings = F.normalize(embeddings, p=2, dim=1)
119
-
120
  return embeddings[0].cpu().numpy().tolist()
121
 
122
- @st.cache_data
123
- def load_dataset_from_source(source: str, path: str):
124
- logger.info(f"Source séléctionnée {source}")
125
- if source == "HuggingFace Hub":
126
-
127
- dataset = load_dataset(path)
128
- data = []
129
- for split in dataset.keys():
130
- data.extend(dataset[split].to_list())
131
- return data
132
- else:
133
- data = []
134
- with open(path, 'r') as f:
135
- for line in f:
136
- if line.strip():
137
- data.append(json.loads(line))
138
- return data
139
-
140
- # @st.cache_resource # <--- Décommenter si tu es sous Streamlit pour ne le faire qu'une fois !
141
  def initialize_chromadb():
142
  """
143
- Mode Static RAG : Télécharge la DB depuis Hugging Face et se connecte en lecture seule.
 
144
  """
145
-
146
- # 1. CHEMIN CIBLE
147
- # Le chemin final sera : ./chroma_cache/chroma_db_storage
148
  final_db_path = LOCAL_CACHE_DIR / REPO_FOLDER
149
 
150
- # 2. TÉLÉCHARGEMENT (Si pas déjà présent)
151
  if not final_db_path.exists():
152
- print(f"📥 Téléchargement de la base depuis {DATASET_ID}...")
153
  try:
154
  snapshot_download(
155
  repo_id=DATASET_ID,
156
  repo_type="dataset",
157
- local_dir=LOCAL_CACHE_DIR, # On télécharge DANS le cache
158
- allow_patterns=[f"{REPO_FOLDER}/*"], # On ne prend que le dossier DB
159
- local_dir_use_symlinks=False,
160
- # token=os.environ.get("HF_TOKEN") # Nécessaire si le dataset est PRIVÉ
161
  )
162
- print("Téléchargement terminé.")
163
  except Exception as e:
164
- print(f" Erreur de téléchargement : {e}")
165
- # Fallback : Si on est en local et que le dossier existe déjà ailleurs, on pourrait pointer dessus
166
  raise e
167
 
168
- # 3. CONNEXION CHROMA
169
- # On pointe vers le dossier contenant le fichier sqlite3
170
  client = chromadb.PersistentClient(path=str(final_db_path))
171
 
172
- # 4. RÉCUPÉRATION DE LA COLLECTION
173
- # Attention : On ne fait plus de "create_collection" ni de "delete".
174
- # On récupère juste ce qui existe.
175
  try:
176
  collection = client.get_collection(name="feedbacks")
177
- print(f"📊 Collection chargée. {collection.count()} documents disponibles.")
178
  except Exception as e:
179
- print(f" Erreur : La collection 'feedbacks' n'existe pas dans la base téléchargée.")
180
  raise e
181
 
182
  return client, collection
 
183
  # ==========================================
184
- # MAIN APP
185
  # ==========================================
186
 
187
  st.title("FFGEN")
188
  st.markdown("### Submit code and get instant feedback")
189
 
190
- # ==========================================
191
- # SIDEBAR - CONFIGURATION
192
- # ==========================================
193
-
194
  with st.sidebar:
195
- st.header(" Configuration")
196
-
197
- # --- MODEL SELECTION ---
198
- st.subheader("Embedding Model")
199
- model_path = st.text_input(
200
- "Model Path (Local or HF)",
201
- value="matis35/gemmaembedding-fgdor",
202
- help="Path to embedding model"
203
- )
204
-
205
- # --- DATASET SELECTION ---
206
- st.subheader("Dataset")
207
- data_source = st.selectbox("Source", ["HuggingFace Hub", "Local JSONL"])
208
- dataset_path = st.text_input("Dataset Path", value="matis35/SYNT_V4")
209
 
 
 
 
210
  st.divider()
211
-
212
- # --- CACHE SETTINGS ---
213
- st.subheader("Cache Settings")
214
-
215
- # Permettre de modifier le threshold dynamiquement
216
  if 'custom_threshold' not in st.session_state:
217
  st.session_state.custom_threshold = DISTANCE_THRESHOLD
218
 
219
  custom_threshold = st.slider(
220
- "Semantic distance threshold",
221
- min_value=0.1,
222
- max_value=1.0,
223
- value=st.session_state.custom_threshold,
224
- step=0.05,
225
- help="Distance < threshold = HIT. Modifier cette valeur change le comportement du cache sans réindexer."
226
  )
227
-
228
  if custom_threshold != st.session_state.custom_threshold:
229
  st.session_state.custom_threshold = custom_threshold
230
- # Mettre à jour le threshold du cache manager existant si disponible
231
  if st.session_state.get('cache_manager'):
232
  st.session_state.cache_manager.threshold = custom_threshold
233
- st.info(f"Threshold updated to {custom_threshold:.2f}")
234
-
235
- st.caption(f"Current: Distance < {st.session_state.custom_threshold:.2f} = HIT")
236
 
237
  st.divider()
238
 
239
- force_reindex = st.checkbox("Force Re-index", value=False)
 
 
 
 
 
240
 
241
- col1, col2 = st.columns(2)
242
- with col1:
243
- load_btn = st.button("Load & Index", use_container_width=True)
244
- with col2:
245
- use_cached_btn = st.button(" Use Cached", use_container_width=True)
246
 
247
- # --- LOAD CACHED DB ---
248
- if use_cached_btn:
249
- try:
250
- client, collection = initialize_chromadb(force_reindex=False)
251
- count = collection.count()
252
- if count > 0:
253
- st.session_state.client = client
254
- st.session_state.collection = collection
255
- st.session_state.db_initialized = True
256
- st.success(f"DB Loaded: {count} docs")
257
- logger.info(f"Base de données démarrée avec succès: {count} instances")
258
- if not st.session_state.model_loaded:
259
- model, tokenizer = load_full_model(model_path)
260
- if model:
261
- st.session_state.model = model
262
- st.session_state.tokenizer = tokenizer
263
- st.session_state.model_loaded = True
264
-
265
- # Initialiser cache manager avec threshold dynamique
266
- encoder_fn = lambda text: encode_text(text, model, tokenizer)
267
- st.session_state.cache_manager = CacheManager(
268
- collection,
269
- encoder_fn,
270
- threshold=st.session_state.custom_threshold
271
- )
272
-
273
- # Initialiser DeepSeek caller
274
- try:
275
- st.session_state.deepseek_caller = DeepSeekCaller()
276
- st.success(" DeepSeek API Ready")
277
- logger.info("API prête")
278
- except Exception as e:
279
- st.warning(f" DeepSeek API unavailable: {e}")
280
- logger.error(f"API non disponible: {e}")
281
- else:
282
- st.warning(" Empty DB. Please Load & Index first.")
283
- except Exception as e:
284
- st.error(f"Error: {e}")
285
- logger.error(f"Problème avec la base de données: {e}")
286
 
287
- # --- LOAD AND INDEX ---
288
- if load_btn:
289
- with st.spinner("Loading Model..."):
290
  model, tokenizer = load_full_model(model_path)
291
  if model:
292
  st.session_state.model = model
@@ -295,340 +200,107 @@ with st.sidebar:
295
  else:
296
  st.stop()
297
 
298
- with st.spinner("Loading Dataset..."):
299
- logger.info("Chargement du dataset")
300
  try:
301
- data = load_dataset_from_source(data_source, dataset_path)
302
- st.session_state.dataset = data
303
- st.session_state.dataset_loaded = True
304
- except Exception as e:
305
- st.error(f"Dataset Error: {e}")
306
- logger.error("Problème de chargement du dataset")
307
- st.stop()
308
-
309
- if st.session_state.dataset_loaded:
310
- with st.spinner(f"Indexing {len(data)} items..."):
311
- client, collection = initialize_chromadb(force_reindex=force_reindex)
312
-
313
- batch_size = 64
314
- progress_bar = st.progress(0)
315
-
316
- for i in range(0, len(data), batch_size):
317
- batch = data[i:i+batch_size]
318
-
319
- feedbacks = [item.get("feedback", item.get("generated_feedback", "")) for item in batch]
320
- codes = [item.get("code") for item in batch]
321
-
322
- # IMPORTANT: Encode FEEDBACK for bi-encoder retrieval (code→feedback)
323
- embeddings = [encode_text(fb, model, tokenizer) for fb in feedbacks]
324
-
325
- # Store code as metadata for later comparison
326
- metadatas = [{"code": c if c else ""} for c in codes]
327
- ids = [f"id_{i+j}" for j in range(len(batch))]
328
-
329
- collection.add(
330
- embeddings=embeddings,
331
- documents=feedbacks,
332
- metadatas=metadatas,
333
- ids=ids
334
- )
335
- progress_bar.progress(min(1.0, (i + batch_size) / len(data)))
336
-
337
  st.session_state.client = client
338
  st.session_state.collection = collection
339
  st.session_state.db_initialized = True
340
-
341
- # Initialiser cache manager avec threshold dynamique
342
  encoder_fn = lambda text: encode_text(text, model, tokenizer)
343
  st.session_state.cache_manager = CacheManager(
344
  collection,
345
  encoder_fn,
346
  threshold=st.session_state.custom_threshold
347
  )
348
-
349
- # Initialiser DeepSeek
350
  try:
351
  st.session_state.deepseek_caller = DeepSeekCaller()
352
  except:
353
- pass
354
-
355
- st.success(" Indexing Complete!")
 
 
 
 
 
356
 
357
- # ==========================================
358
- # MAIN INTERFACE - QUERY
359
- # ==========================================
360
 
361
  if st.session_state.db_initialized and st.session_state.cache_manager:
362
-
363
- st.header(" Submit Your Code")
364
-
365
- # Formulaire enrichi
366
  with st.form("code_submission"):
367
  col1, col2 = st.columns([2, 1])
368
-
369
- with col1:
370
- code_input = st.text_area(
371
- "C Code",
372
- height=300,
373
- placeholder="Paste your C code here...",
374
- help="The code you want feedback on"
375
- )
376
-
377
- with col2:
378
- theme = st.text_input(
379
- "Exercise Theme",
380
- placeholder="e.g., Binary Search",
381
- help="What is this exercise about?"
382
- )
383
-
384
- difficulty = st.selectbox(
385
- "Difficulty Level",
386
- ["beginner", "intermediate", "advanced"]
387
- )
388
-
389
- error_category = st.text_input(
390
- "Error Category (optional)",
391
- placeholder="e.g., Off-by-one Error",
392
- help="If you know the type of error"
393
- )
394
-
395
- instructions = st.text_area(
396
- "Exercise Instructions (optional)",
397
- placeholder="Describe what the function should do...",
398
- help="Helps generate better feedback on cache miss"
399
- )
400
-
401
- col1, col2 = st.columns(2)
402
  with col1:
403
- test_scope = st.text_input(
404
- "Test Cases Scope (optional)",
405
- placeholder="e.g., Test with n=0, n=5, n=10",
406
- help="What tests should pass"
407
- )
408
-
409
  with col2:
410
- failed_tests = st.text_input(
411
- "Failed Tests (optional)",
412
- placeholder="e.g., Test n=0 returns wrong value",
413
- help="Which tests are failing"
414
- )
415
-
416
- submit_btn = st.form_submit_button(" Search Feedback", use_container_width=True)
417
 
418
- # TRAITEMENT DE LA REQUÊTE
419
  if submit_btn and code_input:
420
  start_time = time.time()
421
-
422
- # Contexte complet
423
  context = {
424
- "code": code_input,
425
- "theme": theme or "N/A",
426
- "difficulty": difficulty,
427
- "error_category": error_category or "Unknown",
428
- "instructions": instructions or "No instructions provided",
429
- "test_cases_scope": [test_scope] if test_scope else [],
430
- "failed_tests": [failed_tests] if failed_tests else []
431
  }
432
 
433
- # Query cache
434
- with st.spinner(" Searching cache..."):
435
  cache_result = st.session_state.cache_manager.query_cache(code_input, context)
 
 
436
 
437
- response_time = (time.time() - start_time) * 1000 # ms
438
-
439
- # CACHE HIT ou PERFECT MATCH
440
  if cache_result['status'] in ['hit', 'perfect_match']:
441
- is_perfect = cache_result['status'] == 'perfect_match'
442
-
443
- st.markdown('<div class="hit-card">', unsafe_allow_html=True)
 
 
 
 
 
 
 
444
 
445
- if is_perfect:
446
- st.markdown("### PERFECT CODE MATCH - Exact Feedback Found")
447
- st.success("The submitted code is identical (similarity > 95%) to a code in the database. This feedback is 100% accurate.")
448
- else:
449
- st.markdown("### Cache HIT - Feedback from Database")
450
-
451
- col1, col2, col3 = st.columns(3)
452
- with col1:
453
- st.metric("Confidence", f"{cache_result['confidence']:.2f}")
454
- with col2:
455
- st.metric("Best Match Distance (code→feedback)", f"{cache_result['similarity_scores'][0]:.4f}")
456
- with col3:
457
- st.metric("Response Time", f"{response_time:.0f} ms")
458
-
459
- # Afficher code similarity si disponible
460
- if cache_result.get('code_similarity') is not None:
461
- st.metric("Code Similarity", f"{cache_result['code_similarity']:.4f}",
462
- help="Similarity between your code and reference code (1.0 = identical)")
463
-
464
- if cache_result['needs_warning'] and not is_perfect:
465
- st.warning(" **Note:** Confidence is moderate. Review carefully.")
466
-
467
- # Afficher les résultats
468
- for result in cache_result['results']:
469
- # Calculer distance code_soumis ↔ code_référence
470
- code_ref = result['code']
471
- if code_ref and code_ref != 'N/A':
472
- code_ref_embedding = encode_text(code_ref, st.session_state.model, st.session_state.tokenizer)
473
- code_submitted_embedding = encode_text(code_input, st.session_state.model, st.session_state.tokenizer)
474
-
475
- # Cosine similarity
476
- import numpy as np
477
- similarity = np.dot(code_ref_embedding, code_submitted_embedding)
478
- code_distance = 1 - similarity
479
- else:
480
- code_distance = None
481
-
482
- with st.expander(f" Match #{result['rank']} (code→feedback distance: {result['distance']:.4f})"):
483
- # Métriques côte à côte
484
- col1, col2 = st.columns(2)
485
- with col1:
486
- st.metric("Code → Feedback", f"{result['distance']:.4f}", help="Distance entre votre code et ce feedback (apprentissage bi-encoder)")
487
- with col2:
488
- if code_distance is not None:
489
- st.metric("Code → Code Ref", f"{code_distance:.4f}", help="Distance entre votre code et le code de référence pour ce feedback")
490
-
491
- st.markdown("**Feedback:**")
492
- st.write(result['feedback'])
493
-
494
- st.markdown("**Reference Code (this feedback was given for):**")
495
- st.code(result['code'], language='c')
496
-
497
- st.markdown('</div>', unsafe_allow_html=True)
498
-
499
- # Log stats
500
- st.session_state.stats_logger.log_query({
501
- "query_id": cache_result['query_id'],
502
- "status": "hit",
503
- "similarity_score": cache_result['similarity_scores'][0],
504
- "confidence": cache_result['confidence'],
505
- "response_time_ms": response_time,
506
- "theme": theme,
507
- "error_category": error_category,
508
- "difficulty": difficulty,
509
- "deepseek_tokens": 0,
510
- "cache_size": st.session_state.collection.count()
511
- })
512
-
513
- # CACHE MISS
514
- elif cache_result['status'] == 'miss':
515
- st.markdown('<div class="miss-card">', unsafe_allow_html=True)
516
- st.markdown("### Cache MISS - Generating New Feedback")
517
-
518
- st.info(f" Closest match distance: {cache_result.get('closest_distance', 1.0):.4f} (threshold: {st.session_state.custom_threshold:.2f})")
519
-
520
- # Afficher les codes les plus proches même en cas de miss
521
- if cache_result['results']:
522
- st.markdown("#### Closest matches found (but below threshold):")
523
- for result in cache_result['results']:
524
- # Calculer distance code_soumis ↔ code_référence
525
- code_ref = result['code']
526
- if code_ref and code_ref != 'N/A':
527
- code_ref_embedding = encode_text(code_ref, st.session_state.model, st.session_state.tokenizer)
528
- code_submitted_embedding = encode_text(code_input, st.session_state.model, st.session_state.tokenizer)
529
-
530
- import numpy as np
531
- similarity = np.dot(code_ref_embedding, code_submitted_embedding)
532
- code_distance = 1 - similarity
533
- else:
534
- code_distance = None
535
-
536
- with st.expander(f"Match #{result['rank']} (code→feedback: {result['distance']:.4f})"):
537
- # Métriques côte à côte
538
- col1, col2 = st.columns(2)
539
- with col1:
540
- st.metric("Code → Feedback", f"{result['distance']:.4f}", help="Distance bi-encoder (apprentissage)")
541
- with col2:
542
- if code_distance is not None:
543
- st.metric("Code → Code Ref", f"{code_distance:.4f}", help="Distance code soumis vs code de référence")
544
-
545
- st.markdown("**Feedback (given for reference code):**")
546
- st.write(result['feedback'])
547
-
548
- st.markdown("**Reference Code:**")
549
- st.code(result['code'], language='c')
550
-
551
- st.divider()
552
-
553
- # Appeler DeepSeek
554
  if st.session_state.deepseek_caller:
555
- with st.spinner(" Generating feedback with DeepSeek..."):
556
- deepseek_result = st.session_state.deepseek_caller.generate_feedback(context)
557
-
558
- if deepseek_result.get('feedback'):
559
- feedback = deepseek_result['feedback']
560
- tokens_used = deepseek_result['tokens_total']
561
-
562
- st.success(" Feedback Generated!")
563
-
564
- col1, col2, col3 = st.columns(3)
565
- with col1:
566
- st.metric("Tokens Used", tokens_used)
567
- with col2:
568
- st.metric("Generation Time", f"{deepseek_result['generation_time_ms']:.0f} ms")
569
- with col3:
570
- st.metric("Total Time", f"{response_time + deepseek_result['generation_time_ms']:.0f} ms")
571
-
572
- st.markdown("**Generated Feedback:**")
573
  st.write(feedback)
574
-
575
- # Distillation : Ajouter au cache
576
- with st.spinner(" Adding to cache (distillation)..."):
577
- # Encoder le feedback
578
- feedback_embedding = encode_text(feedback, st.session_state.model, st.session_state.tokenizer)
579
-
580
- success = st.session_state.cache_manager.add_to_cache(
581
- code=code_input,
582
- feedback=feedback,
583
- metadata=context,
584
- embedding=feedback_embedding
585
- )
586
-
587
- if success:
588
- st.success(" Feedback added to cache for future queries!")
589
-
590
- # Log cache miss (format dataset)
591
- miss_data = {
592
- **context,
593
- "tags": [tag.strip() for tag in error_category.split(',') if tag.strip()] if error_category else [],
594
- "feedback": feedback,
595
- "query_id": cache_result['query_id'],
596
- "tokens_used": tokens_used
597
- }
598
- st.session_state.stats_logger.log_cache_miss(miss_data)
599
-
600
- # Log stats
601
- st.session_state.stats_logger.log_query({
602
- "query_id": cache_result['query_id'],
603
- "status": "miss",
604
- "similarity_score": cache_result.get('closest_distance', 1.0),
605
- "confidence": 1.0, # LLM généré = haute confiance
606
- "response_time_ms": response_time + deepseek_result['generation_time_ms'],
607
- "theme": theme,
608
- "error_category": error_category,
609
- "difficulty": difficulty,
610
- "deepseek_tokens": tokens_used,
611
- "cache_size": st.session_state.collection.count()
612
- })
613
  else:
614
- st.error(f" Error: {deepseek_result.get('error', 'Unknown error')}")
615
  else:
616
- st.error(" DeepSeek API not available. Cannot generate feedback.")
617
-
618
- st.markdown('</div>', unsafe_allow_html=True)
619
 
620
  else:
621
- st.info(" Please configure and load the model + dataset from the sidebar first.")
622
-
623
- st.markdown("""
624
- ### How to use:
625
- 1. **Load Model & Dataset** (or use cached DB)
626
- 2. **Fill in the form** with your code and its context
627
- 3. **Submit** to get feedback
628
- 4. **Check the Stats page** to see cache performance
629
-
630
- ### Cache System:
631
- - **Hit**: Similar code found in database (instant response) Or Relevant feedabck code found in db with code feedback embedder
632
- - **Miss**: No match found, generates new feedback (slower, uses API tokens)
633
- - **Distillation**: New feedbacks are automatically added to the cache
634
- """)
 
1
  """
2
+ Streamlit RAG Viewer avec Cache Intelligent (Static RAG Mode)
3
  """
4
 
5
  import streamlit as st
6
  import torch
7
  import torch.nn.functional as F
8
  from transformers import AutoTokenizer, AutoModel
 
9
  import chromadb
10
  from pathlib import Path
11
  import json
12
  import time
13
  import logging
14
  import sys
15
+ import os
16
+ from huggingface_hub import login, snapshot_download
17
+
18
  # Import des modules custom
19
  from cache_manager import CacheManager
20
  from deepseek_caller import DeepSeekCaller
21
  from stats_logger import StatsLogger
22
  from config import DISTANCE_THRESHOLD
23
  from utils import load_css
 
 
24
 
25
  # ==========================================
26
  # PAGE CONFIG
27
  # ==========================================
28
  st.set_page_config(
29
  page_title="RAG Feedback System",
30
+ page_icon="🧠",
31
  layout="wide",
32
  initial_sidebar_state="expanded"
33
  )
34
 
35
+ # Configuration du Dataset HF contenant la DB Chroma
36
  DATASET_ID = "matis35/chroma-rag-storage"
37
+ REPO_FOLDER = "chroma_db_storage"
 
 
 
38
  LOCAL_CACHE_DIR = Path("./chroma_cache")
39
 
40
  # ==========================================
 
46
# STATE MANAGEMENT
# ==========================================
# Initialise every session-state key once so reruns keep prior values.
if 'model_loaded' not in st.session_state: st.session_state.model_loaded = False
if 'db_initialized' not in st.session_state: st.session_state.db_initialized = False
if 'cache_manager' not in st.session_state: st.session_state.cache_manager = None
if 'deepseek_caller' not in st.session_state: st.session_state.deepseek_caller = None
if 'stats_logger' not in st.session_state: st.session_state.stats_logger = StatsLogger()

# ==========================================
# SETUP & LOGGING
# ==========================================
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger("FFGen_System")

# Hugging Face authentication: environment variable first, then Streamlit secrets.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # BUGFIX: reading st.secrets raises FileNotFoundError when no
    # secrets.toml exists (plain local execution). The previous revision
    # guarded this access with try/except; restore the guard so the app
    # does not crash at import time outside of Streamlit Cloud.
    try:
        if "HF_TOKEN" in st.secrets:
            hf_token = st.secrets["HF_TOKEN"]
    except FileNotFoundError:
        hf_token = None

if hf_token:
    login(token=hf_token)
72
+
73
+ # ==========================================
74
+ # CORE FUNCTIONS
75
+ # ==========================================
76
+
 
 
 
 
 
77
  @st.cache_resource
78
  def load_full_model(model_path: str):
79
+ """Charge le modèle d'embedding (Hugging Face)"""
80
+ st.info(f"Loading embedding model from: {model_path}...")
 
81
  try:
82
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
83
  if tokenizer.pad_token is None:
 
88
  trust_remote_code=True,
89
  device_map="auto"
90
  )
 
91
  model.eval()
92
  return model, tokenizer
93
  except Exception as e:
94
+ st.error(f"Failed to load model: {e}")
 
95
  return None, None
96
 
97
def encode_text(text: str, model, tokenizer):
    """Encode *text* into a unit-norm embedding vector (plain list of floats).

    Pooling is the mean of the model's last hidden states over the sequence
    axis, followed by L2 normalisation.
    """
    # Run on whatever device the model's parameters live on.
    target_device = next(model.parameters()).device

    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    batch = {name: tensor.to(target_device) for name, tensor in batch.items()}

    with torch.no_grad():
        hidden = model(**batch).last_hidden_state
        pooled = hidden.mean(dim=1)
        pooled = F.normalize(pooled, p=2, dim=1)

    return pooled[0].cpu().numpy().tolist()
107
 
108
@st.cache_resource
def initialize_chromadb():
    """Fetch the pre-built Chroma DB from the HF Hub (if absent) and open it.

    Static-RAG mode: no re-indexing happens here. Returns a
    ``(client, collection)`` pair for the existing "feedbacks" collection.
    Raises if the download fails or the collection is missing.
    """
    db_path = LOCAL_CACHE_DIR / REPO_FOLDER

    # Download the snapshot only on first run; the folder persists locally.
    if not db_path.exists():
        print(f"📥 Downloading vector DB from {DATASET_ID}...")
        try:
            snapshot_download(
                repo_id=DATASET_ID,
                repo_type="dataset",
                local_dir=LOCAL_CACHE_DIR,
                allow_patterns=[f"{REPO_FOLDER}/*"],
                # NOTE(review): deprecated no-op in recent huggingface_hub -- confirm
                local_dir_use_symlinks=False
            )
            print(" Download complete.")
        except Exception as e:
            st.error(f"Failed to download DB: {e}")
            raise e

    # Connect to the persistent store.
    print(f"🔌 Connecting to ChromaDB at {db_path}")
    client = chromadb.PersistentClient(path=str(db_path))

    # Sanity-check that the expected collection is actually in the snapshot.
    try:
        collection = client.get_collection(name="feedbacks")
        print(f"📊 Collection loaded. Documents: {collection.count()}")
    except Exception as e:
        st.error("Collection 'feedbacks' not found in the downloaded DB.")
        raise e

    return client, collection
145
+
146
  # ==========================================
147
+ # MAIN INTERFACE
148
  # ==========================================
149
 
150
  st.title("FFGEN")
151
  st.markdown("### Submit code and get instant feedback")
152
 
153
+ # --- SIDEBAR ---
 
 
 
154
  with st.sidebar:
155
+ st.header("⚙️ System Configuration")
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
+ # Model Config
158
+ model_path = st.text_input("Embedding Model", value="matis35/gemmaembedding-fgdor")
159
+
160
  st.divider()
161
+
162
+ # Cache Sensitivity
163
+ st.subheader("Cache Sensitivity")
 
 
164
  if 'custom_threshold' not in st.session_state:
165
  st.session_state.custom_threshold = DISTANCE_THRESHOLD
166
 
167
  custom_threshold = st.slider(
168
+ "Similarity Threshold", 0.1, 1.0,
169
+ value=st.session_state.custom_threshold, step=0.05,
170
+ help="Lower = Stricter matching. Higher = More matches."
 
 
 
171
  )
172
+
173
  if custom_threshold != st.session_state.custom_threshold:
174
  st.session_state.custom_threshold = custom_threshold
 
175
  if st.session_state.get('cache_manager'):
176
  st.session_state.cache_manager.threshold = custom_threshold
 
 
 
177
 
178
  st.divider()
179
 
180
+ # Active Learning Toggle
181
+ enable_learning = st.checkbox(
182
+ "Enable Active Learning",
183
+ value=True,
184
+ help="If checked, new feedbacks generated by DeepSeek will be added to the local cache for this session."
185
+ )
186
 
187
+ st.divider()
 
 
 
 
188
 
189
+ # Main Action Button
190
+ start_btn = st.button("🚀 Load System", use_container_width=True, type="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
+ if start_btn:
193
+ # 1. Load Model
194
+ with st.spinner("1/2 Loading Neural Model..."):
195
  model, tokenizer = load_full_model(model_path)
196
  if model:
197
  st.session_state.model = model
 
200
  else:
201
  st.stop()
202
 
203
+ # 2. Download & Connect DB
204
+ with st.spinner("2/2 Downloading & Connecting Vector DB..."):
205
  try:
206
+ client, collection = initialize_chromadb() # Appel sans argument !
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  st.session_state.client = client
208
  st.session_state.collection = collection
209
  st.session_state.db_initialized = True
210
+
211
+ # Init Cache Manager
212
  encoder_fn = lambda text: encode_text(text, model, tokenizer)
213
  st.session_state.cache_manager = CacheManager(
214
  collection,
215
  encoder_fn,
216
  threshold=st.session_state.custom_threshold
217
  )
218
+
219
+ # Init DeepSeek
220
  try:
221
  st.session_state.deepseek_caller = DeepSeekCaller()
222
  except:
223
+ st.warning("DeepSeek key not found, generation disabled.")
224
+
225
+ st.success("System Ready!")
226
+ time.sleep(1) # Petit temps pour voir le succès
227
+ st.rerun()
228
+
229
+ except Exception as e:
230
+ st.error(f"Initialization Error: {e}")
231
 
232
# --- MAIN LOGIC ---

if st.session_state.db_initialized and st.session_state.cache_manager:

    # Submission form: code on the left, exercise metadata on the right.
    with st.form("code_submission"):
        col1, col2 = st.columns([2, 1])
        with col1:
            code_input = st.text_area("C Code", height=300, placeholder="int main() { ... }")
        with col2:
            theme = st.text_input("Theme", placeholder="e.g. Arrays")
            difficulty = st.selectbox("Difficulty", ["beginner", "intermediate", "advanced"])
            error_cat = st.text_input("Error Type (Optional)")

        instructions = st.text_area("Instructions", placeholder="Function should return...")
        submit_btn = st.form_submit_button("Search Feedback", use_container_width=True)

    if submit_btn and code_input:
        t0 = time.time()
        query_context = {
            "code": code_input,
            "theme": theme,
            "difficulty": difficulty,
            "error_category": error_cat,
            "instructions": instructions
        }

        # 1. Semantic lookup in the cached feedback database.
        with st.spinner("🔍 Searching knowledge base..."):
            cache_result = st.session_state.cache_manager.query_cache(code_input, query_context)

        latency_ms = (time.time() - t0) * 1000

        if cache_result['status'] in ['hit', 'perfect_match']:
            # CASE 1: serve the best stored feedback directly.
            st.success(f"Feedback found in {latency_ms:.0f}ms (Confidence: {cache_result['confidence']:.2f})")

            top = cache_result['results'][0]
            st.markdown("### 💡 Retrieved Feedback")
            st.write(top['feedback'])

            with st.expander("See Reference Code"):
                st.code(top['code'], language='c')
                st.caption(f"Distance: {top['distance']:.4f}")
        else:
            # CASE 2: cache miss -> fall back to LLM generation.
            st.warning(f"No similar feedback found (Best distance: {cache_result.get('closest_distance', 1.0):.4f}). Generating new...")

            if not st.session_state.deepseek_caller:
                st.error("DeepSeek not configured.")
            else:
                with st.spinner("🤖 Generating analysis with DeepSeek..."):
                    gen_result = st.session_state.deepseek_caller.generate_feedback(query_context)

                if 'feedback' not in gen_result:
                    st.error("Generation failed.")
                else:
                    feedback = gen_result['feedback']
                    st.markdown("### 🤖 Generated Feedback")
                    st.write(feedback)

                    # Distillation: optionally persist the fresh feedback into
                    # the session's vector cache for future lookups.
                    if enable_learning:
                        with st.spinner("💾 Saving to local session cache..."):
                            emb = encode_text(feedback, st.session_state.model, st.session_state.tokenizer)
                            st.session_state.cache_manager.add_to_cache(
                                code=code_input,
                                feedback=feedback,
                                metadata=query_context,
                                embedding=emb
                            )
                            st.toast("Feedback added to cache!", icon="✅")

else:
    st.info("👈 Please load the system from the sidebar to start.")
 
 
 
 
 
 
 
 
 
 
 
 
 
cache_manager.py CHANGED
@@ -1,9 +1,9 @@
1
  """
2
- Cache Manager - Gère Hit/Miss et distillation
3
  """
4
 
5
  import numpy as np
6
- from typing import Dict, List, Any, Tuple
7
  import uuid
8
  from datetime import datetime
9
  from config import DISTANCE_THRESHOLD, TOP_K_RESULTS, CONFIDENCE_THRESHOLD_WARNING
@@ -14,61 +14,31 @@ class CacheManager:
14
  Args:
15
  chroma_collection: Collection ChromaDB
16
  encoder_fn: Fonction pour encoder du texte en embedding
17
- threshold: Custom similarity threshold (if None, uses config default)
18
  """
19
  self.collection = chroma_collection
20
  self.encoder_fn = encoder_fn
21
  self.threshold = threshold if threshold is not None else DISTANCE_THRESHOLD
22
 
23
  def calculate_confidence(self, distances: List[float]) -> float:
24
- """
25
- Calcule un score de confiance basé sur les distances.
26
- Distance plus faible = confiance plus haute.
27
-
28
- Returns:
29
- float entre 0 et 1
30
- """
31
  if not distances:
32
  return 0.0
33
-
34
- # Distance moyenne
35
  avg_distance = np.mean(distances)
36
-
37
- # Convertir distance en confiance (inverse et normalisation)
38
- # Distance de 0 = confiance 1.0
39
- # Distance de 0.5 = confiance 0.5
40
- # Distance de 1.0 = confiance 0.0
41
- confidence = max(0.0, 1.0 - avg_distance)
42
-
43
- return round(confidence, 3)
44
 
45
  def query_cache(self, code: str, context: Dict[str, Any]) -> Dict[str, Any]:
46
  """
47
- Logique d'exécution (Pipeline) :
48
- 1. CHECK RAPIDE : Match exact de la chaîne de caractères (via Metadata).
49
- -> Si trouvé : Retour immédiat (Stop).
50
-
51
- 2. RETRIEVAL : Recherche des 5 vecteurs les plus proches (Bi-Encoder).
52
-
53
- 3. ANALYSE FINE : Sur ces 5 candidats, on vérifie :
54
- A. Est-ce qu'il y a un "Jumeau Sémantique" ? (Code quasi-identique > 0.95)
55
- -> Si oui : C'est un HIT forcé (Priorité sur le seuil).
56
- B. Est-ce que le meilleur candidat est sous le seuil de distance ?
57
- -> Si oui : C'est un HIT standard.
58
-
59
- 4. DÉCISION : Si ni A ni B -> MISS.
60
  """
61
 
62
- # --- ÉTAPE 1 : CHECK RAPIDE (String Exact Match) ---
63
  try:
64
- # On vérifie si la chaîne de caractères brute existe déjà
65
  if len(code) < 5000:
66
- exact_matches = self.collection.get(
67
- where={"code": code},
68
- limit=1
69
- )
70
  if exact_matches and len(exact_matches['ids']) > 0:
71
- print("Cache: MATCH EXACT (String) trouvé !")
72
  return {
73
  "status": "perfect_match",
74
  "results": [{
@@ -78,22 +48,15 @@ class CacheManager:
78
  "rank": 1,
79
  "metadata": exact_matches['metadatas'][0]
80
  }],
81
- "similarity_scores": [0.0],
82
  "confidence": 1.0,
83
- "needs_deepseek": False,
84
  "needs_warning": False,
85
- "query_id": str(uuid.uuid4()),
86
- "query_embedding": [],
87
- "perfect_code_match": True
88
  }
89
  except Exception as e:
90
- print(f"Warning exact match: {e}")
91
 
92
- # --- ÉTAPE 2 : RETRIEVAL (Recherche Vectorielle) ---
93
- # On a besoin des candidats pour faire les analyses suivantes
94
-
95
  query_embedding = self.encoder_fn(code)
96
-
97
  query_results = self.collection.query(
98
  query_embeddings=[query_embedding],
99
  n_results=TOP_K_RESULTS
@@ -103,129 +66,88 @@ class CacheManager:
103
  documents = query_results['documents'][0] if query_results['documents'] else []
104
  metadatas = query_results['metadatas'][0] if query_results['metadatas'] else []
105
 
106
- # --- ÉTAPE 3 : ANALYSE FINE (Code Similarity Check) ---
107
- # On cherche un "Jumeau Sémantique" parmi les résultats retournés
108
- code_similarity = None
109
  perfect_code_match = False
 
110
 
111
- # On regarde uniquement le meilleur candidat (rank 1) pour la comparaison code-à-code
112
  if metadatas and metadatas[0].get('code'):
113
  ref_code = metadatas[0].get('code')
114
  if ref_code and ref_code != 'N/A':
 
115
  ref_code_embedding = self.encoder_fn(ref_code)
116
- # Produit scalaire
117
  code_similarity = float(np.dot(query_embedding, ref_code_embedding))
118
-
119
- # Si > 0.95, c'est le même code écrit différemment (ex: espaces, commentaires)
120
  if code_similarity > 0.95:
121
  perfect_code_match = True
122
 
123
- # --- ÉTAPE 4 : DÉCISION HIT / MISS ---
124
-
125
- # Condition A : Jumeau Sémantique (Le code est quasi identique)
126
- # Condition B : Proximité Vectorielle Standard (Le sens est proche, sous le seuil)
127
-
128
  is_hit = False
129
  hit_type = "miss"
130
 
131
  if perfect_code_match:
132
  is_hit = True
133
- hit_type = "perfect_match" # Priorité haute
134
  elif distances and distances[0] < self.threshold:
135
  is_hit = True
136
- hit_type = "hit" # Priorité standard
137
 
138
- # --- CONSTRUCTION DE LA RÉPONSE ---
139
-
140
- # Préparation des résultats formatés (utilisé dans les deux cas)
141
  formatted_results = []
142
- for i, (feedback, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
143
  formatted_results.append({
144
  "rank": i + 1,
145
  "feedback": feedback,
146
  "code": metadata.get('code', 'N/A'),
147
- "distance": round(distance, 4),
148
  "metadata": metadata
149
  })
150
 
151
  if is_hit:
152
- # Calcul confiance
153
  confidence = self.calculate_confidence(distances)
154
- if perfect_code_match:
155
- confidence = 1.0 # Boost max car on est sûr du code
156
-
157
  return {
158
  "status": hit_type,
159
  "results": formatted_results,
160
- "similarity_scores": [round(d, 4) for d in distances],
161
- "confidence": confidence,
162
- "needs_deepseek": False,
163
- # Warning uniquement si c'est un hit "mou" (vecteur lointain) ET pas un match de code
164
  "needs_warning": False if perfect_code_match else (confidence < CONFIDENCE_THRESHOLD_WARNING),
165
- "query_embedding": query_embedding,
166
- "query_id": str(uuid.uuid4()),
167
- "code_similarity": round(code_similarity, 4) if code_similarity is not None else None,
168
- "perfect_code_match": perfect_code_match
169
  }
170
-
171
  else:
172
- # MISS
173
  return {
174
  "status": "miss",
175
- "results": formatted_results, # On renvoie quand même les proches pour info
176
- "similarity_scores": [round(d, 4) for d in distances] if distances else [],
177
  "confidence": 0.0,
178
- "needs_deepseek": True,
179
  "needs_warning": False,
180
- "query_embedding": query_embedding,
181
- "query_id": str(uuid.uuid4()),
182
  "closest_distance": round(distances[0], 4) if distances else 1.0
183
  }
 
184
  def add_to_cache(self, code: str, feedback: str, metadata: Dict[str, Any], embedding: List[float]) -> bool:
185
  """
186
- Ajoute une nouvelle entrée au cache (distillation online).
187
-
188
- Args:
189
- code: Code source
190
- feedback: Feedback généré
191
- metadata: Métadonnées complètes (theme, difficulty, etc.)
192
- embedding: Embedding du feedback
193
-
194
- Returns:
195
- bool: True si succès
196
  """
197
  try:
198
- doc_id = f"miss_{uuid.uuid4()}"
199
-
200
- # Préparer metadata pour ChromaDB (seulement le code car limitation)
201
- chroma_metadata = {
202
- "code": code,
203
  "timestamp": datetime.now().isoformat(),
204
- "source": "cache_miss"
 
 
205
  }
206
 
207
  self.collection.add(
208
  embeddings=[embedding],
209
  documents=[feedback],
210
- metadatas=[chroma_metadata],
211
  ids=[doc_id]
212
  )
213
-
214
  return True
215
 
216
  except Exception as e:
217
- print(f"Error adding to cache: {e}")
218
- return False
219
-
220
- def get_cache_stats(self) -> Dict[str, Any]:
221
- """Retourne des stats sur le cache"""
222
- try:
223
- total_docs = self.collection.count()
224
-
225
- return {
226
- "total_documents": total_docs,
227
- "similarity_threshold": SIMILARITY_THRESHOLD,
228
- "top_k": TOP_K_RESULTS
229
- }
230
- except Exception as e:
231
- return {"error": str(e)}
 
1
  """
2
+ Cache Manager - Gère Hit/Miss et distillation locale
3
  """
4
 
5
  import numpy as np
6
+ from typing import Dict, List, Any
7
  import uuid
8
  from datetime import datetime
9
  from config import DISTANCE_THRESHOLD, TOP_K_RESULTS, CONFIDENCE_THRESHOLD_WARNING
 
14
  Args:
15
  chroma_collection: Collection ChromaDB
16
  encoder_fn: Fonction pour encoder du texte en embedding
17
+ threshold: Custom similarity threshold
18
  """
19
  self.collection = chroma_collection
20
  self.encoder_fn = encoder_fn
21
  self.threshold = threshold if threshold is not None else DISTANCE_THRESHOLD
22
 
23
  def calculate_confidence(self, distances: List[float]) -> float:
24
+ """Convertit la distance Chroma (Cosine) en score de confiance [0, 1]."""
 
 
 
 
 
 
25
  if not distances:
26
  return 0.0
27
+ # Avec hnsw:space="cosine", distance = 1 - similarity.
28
+ # Donc Similarity = 1 - distance.
29
  avg_distance = np.mean(distances)
30
+ return max(0.0, 1.0 - avg_distance)
 
 
 
 
 
 
 
31
 
32
  def query_cache(self, code: str, context: Dict[str, Any]) -> Dict[str, Any]:
33
  """
34
+ Recherche dans le cache (Pipeline Hybride: Exact Match -> Vector Search -> Code Comparison)
 
 
 
 
 
 
 
 
 
 
 
 
35
  """
36
 
37
+ # 1. CHECK RAPIDE (String Exact Match)
38
  try:
 
39
  if len(code) < 5000:
40
+ exact_matches = self.collection.get(where={"code": code}, limit=1)
 
 
 
41
  if exact_matches and len(exact_matches['ids']) > 0:
 
42
  return {
43
  "status": "perfect_match",
44
  "results": [{
 
48
  "rank": 1,
49
  "metadata": exact_matches['metadatas'][0]
50
  }],
 
51
  "confidence": 1.0,
 
52
  "needs_warning": False,
53
+ "closest_distance": 0.0
 
 
54
  }
55
  except Exception as e:
56
+ print(f"Warning exact match check: {e}")
57
 
58
+ # 2. RETRIEVAL (Vectorielle)
 
 
59
  query_embedding = self.encoder_fn(code)
 
60
  query_results = self.collection.query(
61
  query_embeddings=[query_embedding],
62
  n_results=TOP_K_RESULTS
 
66
  documents = query_results['documents'][0] if query_results['documents'] else []
67
  metadatas = query_results['metadatas'][0] if query_results['metadatas'] else []
68
 
69
+ # 3. ANALYSE (Similarity Check)
 
 
70
  perfect_code_match = False
71
+ code_similarity = 0.0
72
 
73
+ # Vérification sémantique du code sur le meilleur candidat
74
  if metadatas and metadatas[0].get('code'):
75
  ref_code = metadatas[0].get('code')
76
  if ref_code and ref_code != 'N/A':
77
+ # On encode le code de référence pour comparer avec le code d'entrée
78
  ref_code_embedding = self.encoder_fn(ref_code)
79
+ # Produit scalaire (approximatif si vecteurs normalisés)
80
  code_similarity = float(np.dot(query_embedding, ref_code_embedding))
 
 
81
  if code_similarity > 0.95:
82
  perfect_code_match = True
83
 
84
+ # 4. DÉCISION
 
 
 
 
85
  is_hit = False
86
  hit_type = "miss"
87
 
88
  if perfect_code_match:
89
  is_hit = True
90
+ hit_type = "perfect_match"
91
  elif distances and distances[0] < self.threshold:
92
  is_hit = True
93
+ hit_type = "hit"
94
 
95
+ # Formatage des résultats
 
 
96
  formatted_results = []
97
+ for i, (feedback, metadata, dist) in enumerate(zip(documents, metadatas, distances)):
98
  formatted_results.append({
99
  "rank": i + 1,
100
  "feedback": feedback,
101
  "code": metadata.get('code', 'N/A'),
102
+ "distance": round(dist, 4),
103
  "metadata": metadata
104
  })
105
 
106
  if is_hit:
 
107
  confidence = self.calculate_confidence(distances)
108
+ if perfect_code_match: confidence = 1.0
109
+
 
110
  return {
111
  "status": hit_type,
112
  "results": formatted_results,
113
+ "confidence": round(confidence, 3),
 
 
 
114
  "needs_warning": False if perfect_code_match else (confidence < CONFIDENCE_THRESHOLD_WARNING),
115
+ "closest_distance": round(distances[0], 4)
 
 
 
116
  }
 
117
  else:
 
118
  return {
119
  "status": "miss",
120
+ "results": formatted_results,
 
121
  "confidence": 0.0,
 
122
  "needs_warning": False,
 
 
123
  "closest_distance": round(distances[0], 4) if distances else 1.0
124
  }
125
+
126
  def add_to_cache(self, code: str, feedback: str, metadata: Dict[str, Any], embedding: List[float]) -> bool:
127
  """
128
+ Ajoute au cache local pour la session courante.
 
 
 
 
 
 
 
 
 
129
  """
130
  try:
131
+ doc_id = f"learned_{uuid.uuid4().hex[:8]}"
132
+
133
+ # Nettoyage des métadonnées (Chroma n'aime pas les listes/None)
134
+ safe_metadata = {
135
+ "code": code[:10000], # Limite de taille
136
  "timestamp": datetime.now().isoformat(),
137
+ "source": "active_learning",
138
+ "theme": str(metadata.get("theme", "")),
139
+ "difficulty": str(metadata.get("difficulty", ""))
140
  }
141
 
142
  self.collection.add(
143
  embeddings=[embedding],
144
  documents=[feedback],
145
+ metadatas=[safe_metadata],
146
  ids=[doc_id]
147
  )
148
+ print(f"✅ Learned new feedback: {doc_id}")
149
  return True
150
 
151
  except Exception as e:
152
+ print(f"Error adding to cache: {e}")
153
+ return False