Matis Codjia committed
Commit 27bbd47
Parent(s): eb53740
Feat: Source files
- .gitignore +3 -0
- README.md +83 -20
- __init__.py +0 -0
- app.py +586 -0
- assets/style.css +66 -0
- assets/style_search.css +52 -0
- benchmarks/human_evaluation_results.csv +13 -0
- cache_manager.py +195 -0
- config.py +81 -0
- data/.gitkeep +0 -0
- datasets/dataset_c_piscine_semantic.jsonl +10 -0
- deepseek_caller.py +110 -0
- pages/benchmark.py +196 -0
- pages/search.py +129 -0
- pages/stats.py +276 -0
- reload_config.py +18 -0
- requirements.txt +6 -3
- run.sh +15 -0
- stats_logger.py +180 -0
- test_system.py +162 -0
- trend_analyzer.py +556 -0
- utils.py +4 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@

```text
echo "chroma_db_storage/" >> .gitignore
echo "*.sqlite3" >> .gitignore
echo "__pycache__/" >> .gitignore
```
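Note that the three added lines are shell commands rather than ignore patterns: the `echo … >> .gitignore` invocations appear to have been committed as the file's content instead of being run. As a hedged sketch (assuming the commands were meant to be executed, not committed), running them in a scratch directory produces the intended .gitignore:

```shell
# Hypothetical: run the committed lines as shell commands so the
# three patterns end up as the contents of .gitignore.
cd "$(mktemp -d)"
echo "chroma_db_storage/" >> .gitignore
echo "*.sqlite3" >> .gitignore
echo "__pycache__/" >> .gitignore
cat .gitignore
```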
README.md
CHANGED
@@ -1,20 +1,83 @@

(the previous 20-line README was replaced with:)

# Streamlit RAG Viewer

Simple, colorful, and professional interface for semantic code feedback search.

## Features

- ✅ **Pure Streamlit** - No separate backend
- 🎨 **Colorful & Adaptive** - Modern gradient UI
- 🤖 **Base Model + PEFT** - Load 400M model + your adapter
- 🔍 **Semantic Search** - ChromaDB vector search
- 📦 **Flexible Data** - HuggingFace Hub or local JSONL

## Quick Start

```bash
cd streamlit_rag_viewer

# Install
pip install -r requirements.txt

# Run
streamlit run app.py
```

Opens at **http://localhost:8501**

## Usage

### 1. Configure (Sidebar)

- **Dataset**: Choose HuggingFace Hub or local JSONL
- **Base Model**: `Salesforce/SFR-Embedding-Code-400M_R`
- **PEFT Adapter**: Your adapter from Hub (e.g., `matis35/my-adapter`)

### 2. Load & Index

Click **🚀 Load & Index** - it will:

1. Load the base model (400M)
2. Load your PEFT adapter
3. Combine them
4. Load the dataset
5. Index it in ChromaDB

### 3. Search

- Paste code
- Choose k results
- Click **🔍 Search**

## Architecture

**Single Streamlit app** with:

- Model: Base (400M) + PEFT adapter loaded together
- Storage: ChromaDB (persistent in `.chroma_cache/`)
- UI: Gradient colors, adaptive layout

## Why No Backend?

Everything runs in Streamlit:

- Model loading
- Encoding
- ChromaDB
- Search

**No FastAPI needed** - Streamlit handles it all!

## Example Adapter

Your PEFT adapter structure on the Hub:

```
matis35/my-code-adapter/
├── adapter_config.json
├── adapter_model.bin
└── README.md
```

The app will:

1. Load the base model: `AutoModel.from_pretrained("Salesforce/SFR-Embedding-Code-400M_R")`
2. Load the adapter: `PeftModel.from_pretrained(base_model, "matis35/my-code-adapter")`
3. Use the combined model for encoding

## Enjoy! 🚀
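The semantic search the README describes boils down to nearest-neighbor lookup over L2-normalized embeddings: ChromaDB ranks stored vectors by cosine similarity to the query vector. As a minimal sketch (plain NumPy standing in for ChromaDB, with toy 2-D vectors instead of real model embeddings), the ranking step looks like:

```python
import numpy as np

def top_k(query: np.ndarray, index: np.ndarray, k: int) -> list[int]:
    """Return the indices of the k rows of `index` most similar to `query`.

    Assumes every vector is L2-normalized, so a dot product equals
    cosine similarity (and distance = 1 - similarity).
    """
    sims = index @ query                   # cosine similarity per row
    return np.argsort(-sims)[:k].tolist()  # best match first

# Toy 4-vector "index"; the query is identical to row 2
index = np.array([[1.0, 0.0], [0.0, 1.0], [0.6, 0.8], [-1.0, 0.0]])
query = np.array([0.6, 0.8])

print(top_k(query, index, k=2))  # → [2, 1]
```

A real deployment would replace the toy vectors with the encoder's outputs and let ChromaDB handle persistence; the ranking logic is the same.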
__init__.py
ADDED
File without changes

app.py
ADDED
@@ -0,0 +1,586 @@
| 1 |
+
"""
|
| 2 |
+
Streamlit RAG Viewer avec Cache Intelligent
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from transformers import AutoTokenizer, AutoModel
|
| 9 |
+
from datasets import load_dataset
|
| 10 |
+
import chromadb
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import json
|
| 13 |
+
import time
|
| 14 |
+
import logging
|
| 15 |
+
import sys
|
| 16 |
+
# Import des modules custom
|
| 17 |
+
from cache_manager import CacheManager
|
| 18 |
+
from deepseek_caller import DeepSeekCaller
|
| 19 |
+
from stats_logger import StatsLogger
|
| 20 |
+
from config import SIMILARITY_THRESHOLD
|
| 21 |
+
from utils import load_css
|
| 22 |
+
|
| 23 |
+
# ==========================================
|
| 24 |
+
# PAGE CONFIG
|
| 25 |
+
# ==========================================
|
| 26 |
+
st.set_page_config(
|
| 27 |
+
page_title="RAG Feedback System",
|
| 28 |
+
page_icon="",
|
| 29 |
+
layout="wide",
|
| 30 |
+
initial_sidebar_state="expanded"
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
# ==========================================
|
| 34 |
+
# CUSTOM CSS
|
| 35 |
+
# ==========================================
|
| 36 |
+
load_css("assets/style.css")
|
| 37 |
+
|
| 38 |
+
# ==========================================
|
| 39 |
+
# STATE MANAGEMENT
|
| 40 |
+
# ==========================================
|
| 41 |
+
if 'model_loaded' not in st.session_state: st.session_state.model_loaded = False
|
| 42 |
+
if 'dataset_loaded' not in st.session_state: st.session_state.dataset_loaded = False
|
| 43 |
+
if 'db_initialized' not in st.session_state: st.session_state.db_initialized = False
|
| 44 |
+
if 'cache_manager' not in st.session_state: st.session_state.cache_manager = None
|
| 45 |
+
if 'deepseek_caller' not in st.session_state: st.session_state.deepseek_caller = None
|
| 46 |
+
if 'stats_logger' not in st.session_state: st.session_state.stats_logger = StatsLogger()
|
| 47 |
+
|
| 48 |
+
# ==========================================
|
| 49 |
+
# HELPER FUNCTIONS
|
| 50 |
+
# ==========================================
|
| 51 |
+
logging.basicConfig(
|
| 52 |
+
level=logging.INFO,
|
| 53 |
+
format='%(asctime)s | %(levelname)s | %(message)s',
|
| 54 |
+
datefmt='%H:%M:%S',
|
| 55 |
+
handlers=[
|
| 56 |
+
logging.StreamHandler(sys.stdout)
|
| 57 |
+
]
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
logger = logging.getLogger("FFGen_System")
|
| 61 |
+
|
| 62 |
+
@st.cache_resource
|
| 63 |
+
def load_full_model(model_path: str):
|
| 64 |
+
"""Load standard HuggingFace model."""
|
| 65 |
+
st.info(f"Loading model from: {model_path}")
|
| 66 |
+
logger.info(f"Chargement du modèle depuis : {model_path}...")
|
| 67 |
+
try:
|
| 68 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
| 69 |
+
if tokenizer.pad_token is None:
|
| 70 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 71 |
+
|
| 72 |
+
model = AutoModel.from_pretrained(
|
| 73 |
+
model_path,
|
| 74 |
+
trust_remote_code=True,
|
| 75 |
+
device_map="auto"
|
| 76 |
+
)
|
| 77 |
+
logger.info(f"Modèle chargé avec succès !")
|
| 78 |
+
model.eval()
|
| 79 |
+
return model, tokenizer
|
| 80 |
+
except Exception as e:
|
| 81 |
+
st.error(f"Erreur de chargement: {e}")
|
| 82 |
+
logger.error("Echec du chargement du modèle !")
|
| 83 |
+
return None, None
|
| 84 |
+
|
| 85 |
+
def encode_text(text: str, model, tokenizer):
|
| 86 |
+
"""Encode text to embedding."""
|
| 87 |
+
device = next(model.parameters()).device
|
| 88 |
+
|
| 89 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
|
| 90 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 91 |
+
|
| 92 |
+
with torch.no_grad():
|
| 93 |
+
outputs = model(**inputs)
|
| 94 |
+
embeddings = outputs.last_hidden_state.mean(dim=1)
|
| 95 |
+
embeddings = F.normalize(embeddings, p=2, dim=1)
|
| 96 |
+
|
| 97 |
+
return embeddings[0].cpu().numpy().tolist()
|
| 98 |
+
|
| 99 |
+
@st.cache_data
|
| 100 |
+
def load_dataset_from_source(source: str, path: str):
|
| 101 |
+
logger.info(f"Source séléctionnée {source}")
|
| 102 |
+
if source == "HuggingFace Hub":
|
| 103 |
+
|
| 104 |
+
dataset = load_dataset(path)
|
| 105 |
+
data = []
|
| 106 |
+
for split in dataset.keys():
|
| 107 |
+
data.extend(dataset[split].to_list())
|
| 108 |
+
return data
|
| 109 |
+
else:
|
| 110 |
+
data = []
|
| 111 |
+
with open(path, 'r') as f:
|
| 112 |
+
for line in f:
|
| 113 |
+
if line.strip():
|
| 114 |
+
data.append(json.loads(line))
|
| 115 |
+
return data
|
| 116 |
+
|
| 117 |
+
def initialize_chromadb(force_reindex=False):
|
| 118 |
+
db_path = Path("streamlit_rag_viewer/chroma_db_storage")
|
| 119 |
+
db_path.mkdir(parents=True, exist_ok=True)
|
| 120 |
+
|
| 121 |
+
client = chromadb.PersistentClient(path=str(db_path))
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
if force_reindex:
|
| 125 |
+
try: client.delete_collection("feedbacks")
|
| 126 |
+
except: pass
|
| 127 |
+
collection = client.create_collection(name="feedbacks")
|
| 128 |
+
else:
|
| 129 |
+
collection = client.get_collection(name="feedbacks")
|
| 130 |
+
except:
|
| 131 |
+
collection = client.create_collection(name="feedbacks")
|
| 132 |
+
|
| 133 |
+
return client, collection
|
| 134 |
+
|
| 135 |
+
# ==========================================
|
| 136 |
+
# MAIN APP
|
| 137 |
+
# ==========================================
|
| 138 |
+
|
| 139 |
+
st.title("FFGEN")
|
| 140 |
+
st.markdown("### Submit code and get instant feedback")
|
| 141 |
+
|
| 142 |
+
# ==========================================
|
| 143 |
+
# SIDEBAR - CONFIGURATION
|
| 144 |
+
# ==========================================
|
| 145 |
+
|
| 146 |
+
with st.sidebar:
|
| 147 |
+
st.header(" Configuration")
|
| 148 |
+
|
| 149 |
+
# --- MODEL SELECTION ---
|
| 150 |
+
st.subheader("Embedding Model")
|
| 151 |
+
model_path = st.text_input(
|
| 152 |
+
"Model Path (Local or HF)",
|
| 153 |
+
value="matis35/gemmaembedding-fgdor",
|
| 154 |
+
help="Path to embedding model"
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# --- DATASET SELECTION ---
|
| 158 |
+
st.subheader("Dataset")
|
| 159 |
+
data_source = st.selectbox("Source", ["HuggingFace Hub", "Local JSONL"])
|
| 160 |
+
dataset_path = st.text_input("Dataset Path", value="matis35/SYNT_V4")
|
| 161 |
+
|
| 162 |
+
st.divider()
|
| 163 |
+
|
| 164 |
+
# --- CACHE SETTINGS ---
|
| 165 |
+
st.subheader("Cache Settings")
|
| 166 |
+
|
| 167 |
+
# Permettre de modifier le threshold dynamiquement
|
| 168 |
+
if 'custom_threshold' not in st.session_state:
|
| 169 |
+
st.session_state.custom_threshold = SIMILARITY_THRESHOLD
|
| 170 |
+
|
| 171 |
+
custom_threshold = st.slider(
|
| 172 |
+
"Similarity Threshold",
|
| 173 |
+
min_value=0.1,
|
| 174 |
+
max_value=1.0,
|
| 175 |
+
value=st.session_state.custom_threshold,
|
| 176 |
+
step=0.05,
|
| 177 |
+
help="Distance < threshold = HIT. Modifier cette valeur change le comportement du cache sans réindexer."
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
if custom_threshold != st.session_state.custom_threshold:
|
| 181 |
+
st.session_state.custom_threshold = custom_threshold
|
| 182 |
+
# Mettre à jour le threshold du cache manager existant si disponible
|
| 183 |
+
if st.session_state.get('cache_manager'):
|
| 184 |
+
st.session_state.cache_manager.threshold = custom_threshold
|
| 185 |
+
st.info(f"Threshold updated to {custom_threshold:.2f}")
|
| 186 |
+
|
| 187 |
+
st.caption(f"Current: Distance < {st.session_state.custom_threshold:.2f} = HIT")
|
| 188 |
+
|
| 189 |
+
st.divider()
|
| 190 |
+
|
| 191 |
+
force_reindex = st.checkbox("Force Re-index", value=False)
|
| 192 |
+
|
| 193 |
+
col1, col2 = st.columns(2)
|
| 194 |
+
with col1:
|
| 195 |
+
load_btn = st.button("Load & Index", use_container_width=True)
|
| 196 |
+
with col2:
|
| 197 |
+
use_cached_btn = st.button(" Use Cached", use_container_width=True)
|
| 198 |
+
|
| 199 |
+
# --- LOAD CACHED DB ---
|
| 200 |
+
if use_cached_btn:
|
| 201 |
+
try:
|
| 202 |
+
client, collection = initialize_chromadb(force_reindex=False)
|
| 203 |
+
count = collection.count()
|
| 204 |
+
if count > 0:
|
| 205 |
+
st.session_state.client = client
|
| 206 |
+
st.session_state.collection = collection
|
| 207 |
+
st.session_state.db_initialized = True
|
| 208 |
+
st.success(f"DB Loaded: {count} docs")
|
| 209 |
+
logger.info(f"Base de données démarrée avec succès: {count} instances")
|
| 210 |
+
if not st.session_state.model_loaded:
|
| 211 |
+
model, tokenizer = load_full_model(model_path)
|
| 212 |
+
if model:
|
| 213 |
+
st.session_state.model = model
|
| 214 |
+
st.session_state.tokenizer = tokenizer
|
| 215 |
+
st.session_state.model_loaded = True
|
| 216 |
+
|
| 217 |
+
# Initialiser cache manager avec threshold dynamique
|
| 218 |
+
encoder_fn = lambda text: encode_text(text, model, tokenizer)
|
| 219 |
+
st.session_state.cache_manager = CacheManager(
|
| 220 |
+
collection,
|
| 221 |
+
encoder_fn,
|
| 222 |
+
threshold=st.session_state.custom_threshold
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
# Initialiser DeepSeek caller
|
| 226 |
+
try:
|
| 227 |
+
st.session_state.deepseek_caller = DeepSeekCaller()
|
| 228 |
+
st.success(" DeepSeek API Ready")
|
| 229 |
+
logger.info("API prête")
|
| 230 |
+
except Exception as e:
|
| 231 |
+
st.warning(f" DeepSeek API unavailable: {e}")
|
| 232 |
+
logger.error(f"API non disponible: {e}")
|
| 233 |
+
else:
|
| 234 |
+
st.warning(" Empty DB. Please Load & Index first.")
|
| 235 |
+
except Exception as e:
|
| 236 |
+
st.error(f"Error: {e}")
|
| 237 |
+
logger.error(f"Problème avec la base de données: {e}")
|
| 238 |
+
|
| 239 |
+
# --- LOAD AND INDEX ---
|
| 240 |
+
if load_btn:
|
| 241 |
+
with st.spinner("Loading Model..."):
|
| 242 |
+
model, tokenizer = load_full_model(model_path)
|
| 243 |
+
if model:
|
| 244 |
+
st.session_state.model = model
|
| 245 |
+
st.session_state.tokenizer = tokenizer
|
| 246 |
+
st.session_state.model_loaded = True
|
| 247 |
+
else:
|
| 248 |
+
st.stop()
|
| 249 |
+
|
| 250 |
+
with st.spinner("Loading Dataset..."):
|
| 251 |
+
logger.info("Chargement du dataset")
|
| 252 |
+
try:
|
| 253 |
+
data = load_dataset_from_source(data_source, dataset_path)
|
| 254 |
+
st.session_state.dataset = data
|
| 255 |
+
st.session_state.dataset_loaded = True
|
| 256 |
+
except Exception as e:
|
| 257 |
+
st.error(f"Dataset Error: {e}")
|
| 258 |
+
logger.error("Problème de chargement du dataset")
|
| 259 |
+
st.stop()
|
| 260 |
+
|
| 261 |
+
if st.session_state.dataset_loaded:
|
| 262 |
+
with st.spinner(f"Indexing {len(data)} items..."):
|
| 263 |
+
client, collection = initialize_chromadb(force_reindex=force_reindex)
|
| 264 |
+
|
| 265 |
+
batch_size = 64
|
| 266 |
+
progress_bar = st.progress(0)
|
| 267 |
+
|
| 268 |
+
for i in range(0, len(data), batch_size):
|
| 269 |
+
batch = data[i:i+batch_size]
|
| 270 |
+
|
| 271 |
+
feedbacks = [item.get("feedback", item.get("generated_feedback", "")) for item in batch]
|
| 272 |
+
codes = [item.get("code") for item in batch]
|
| 273 |
+
|
| 274 |
+
# IMPORTANT: Encode FEEDBACK for bi-encoder retrieval (code→feedback)
|
| 275 |
+
embeddings = [encode_text(fb, model, tokenizer) for fb in feedbacks]
|
| 276 |
+
|
| 277 |
+
# Store code as metadata for later comparison
|
| 278 |
+
metadatas = [{"code": c if c else ""} for c in codes]
|
| 279 |
+
ids = [f"id_{i+j}" for j in range(len(batch))]
|
| 280 |
+
|
| 281 |
+
collection.add(
|
| 282 |
+
embeddings=embeddings,
|
| 283 |
+
documents=feedbacks,
|
| 284 |
+
metadatas=metadatas,
|
| 285 |
+
ids=ids
|
| 286 |
+
)
|
| 287 |
+
progress_bar.progress(min(1.0, (i + batch_size) / len(data)))
|
| 288 |
+
|
| 289 |
+
st.session_state.client = client
|
| 290 |
+
st.session_state.collection = collection
|
| 291 |
+
st.session_state.db_initialized = True
|
| 292 |
+
|
| 293 |
+
# Initialiser cache manager avec threshold dynamique
|
| 294 |
+
encoder_fn = lambda text: encode_text(text, model, tokenizer)
|
| 295 |
+
st.session_state.cache_manager = CacheManager(
|
| 296 |
+
collection,
|
| 297 |
+
encoder_fn,
|
| 298 |
+
threshold=st.session_state.custom_threshold
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
# Initialiser DeepSeek
|
| 302 |
+
try:
|
| 303 |
+
st.session_state.deepseek_caller = DeepSeekCaller()
|
| 304 |
+
except:
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
st.success(" Indexing Complete!")
|
| 308 |
+
|
| 309 |
+
# ==========================================
|
| 310 |
+
# MAIN INTERFACE - QUERY
|
| 311 |
+
# ==========================================
|
| 312 |
+
|
| 313 |
+
if st.session_state.db_initialized and st.session_state.cache_manager:
|
| 314 |
+
|
| 315 |
+
st.header(" Submit Your Code")
|
| 316 |
+
|
| 317 |
+
# Formulaire enrichi
|
| 318 |
+
with st.form("code_submission"):
|
| 319 |
+
col1, col2 = st.columns([2, 1])
|
| 320 |
+
|
| 321 |
+
with col1:
|
| 322 |
+
code_input = st.text_area(
|
| 323 |
+
"C Code",
|
| 324 |
+
height=300,
|
| 325 |
+
placeholder="Paste your C code here...",
|
| 326 |
+
help="The code you want feedback on"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
with col2:
|
| 330 |
+
theme = st.text_input(
|
| 331 |
+
"Exercise Theme",
|
| 332 |
+
placeholder="e.g., Binary Search",
|
| 333 |
+
help="What is this exercise about?"
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
difficulty = st.selectbox(
|
| 337 |
+
"Difficulty Level",
|
| 338 |
+
["beginner", "intermediate", "advanced"]
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
error_category = st.text_input(
|
| 342 |
+
"Error Category (optional)",
|
| 343 |
+
placeholder="e.g., Off-by-one Error",
|
| 344 |
+
help="If you know the type of error"
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
instructions = st.text_area(
|
| 348 |
+
"Exercise Instructions (optional)",
|
| 349 |
+
placeholder="Describe what the function should do...",
|
| 350 |
+
help="Helps generate better feedback on cache miss"
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
col1, col2 = st.columns(2)
|
| 354 |
+
with col1:
|
| 355 |
+
test_scope = st.text_input(
|
| 356 |
+
"Test Cases Scope (optional)",
|
| 357 |
+
placeholder="e.g., Test with n=0, n=5, n=10",
|
| 358 |
+
help="What tests should pass"
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
with col2:
|
| 362 |
+
failed_tests = st.text_input(
|
| 363 |
+
"Failed Tests (optional)",
|
| 364 |
+
placeholder="e.g., Test n=0 returns wrong value",
|
| 365 |
+
help="Which tests are failing"
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
submit_btn = st.form_submit_button(" Search Feedback", use_container_width=True)
|
| 369 |
+
|
| 370 |
+
# TRAITEMENT DE LA REQUÊTE
|
| 371 |
+
if submit_btn and code_input:
|
| 372 |
+
start_time = time.time()
|
| 373 |
+
|
| 374 |
+
# Contexte complet
|
| 375 |
+
context = {
|
| 376 |
+
"code": code_input,
|
| 377 |
+
"theme": theme or "N/A",
|
| 378 |
+
"difficulty": difficulty,
|
| 379 |
+
"error_category": error_category or "Unknown",
|
| 380 |
+
"instructions": instructions or "No instructions provided",
|
| 381 |
+
"test_cases_scope": [test_scope] if test_scope else [],
|
| 382 |
+
"failed_tests": [failed_tests] if failed_tests else []
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
# Query cache
|
| 386 |
+
with st.spinner(" Searching cache..."):
|
| 387 |
+
cache_result = st.session_state.cache_manager.query_cache(code_input, context)
|
| 388 |
+
|
| 389 |
+
response_time = (time.time() - start_time) * 1000 # ms
|
| 390 |
+
|
| 391 |
+
# CACHE HIT ou PERFECT MATCH
|
| 392 |
+
if cache_result['status'] in ['hit', 'perfect_match']:
|
| 393 |
+
is_perfect = cache_result['status'] == 'perfect_match'
|
| 394 |
+
|
| 395 |
+
st.markdown('<div class="hit-card">', unsafe_allow_html=True)
|
| 396 |
+
|
| 397 |
+
if is_perfect:
|
| 398 |
+
st.markdown("### PERFECT CODE MATCH - Exact Feedback Found")
|
| 399 |
+
st.success("The submitted code is identical (similarity > 95%) to a code in the database. This feedback is 100% accurate.")
|
| 400 |
+
else:
|
| 401 |
+
st.markdown("### Cache HIT - Feedback from Database")
|
| 402 |
+
|
| 403 |
+
col1, col2, col3 = st.columns(3)
|
| 404 |
+
with col1:
|
| 405 |
+
st.metric("Confidence", f"{cache_result['confidence']:.2f}")
|
| 406 |
+
with col2:
|
| 407 |
+
st.metric("Best Match Distance (code→feedback)", f"{cache_result['similarity_scores'][0]:.4f}")
|
| 408 |
+
with col3:
|
| 409 |
+
st.metric("Response Time", f"{response_time:.0f} ms")
|
| 410 |
+
|
| 411 |
+
# Afficher code similarity si disponible
|
| 412 |
+
if cache_result.get('code_similarity') is not None:
|
| 413 |
+
st.metric("Code Similarity", f"{cache_result['code_similarity']:.4f}",
|
| 414 |
+
help="Similarity between your code and reference code (1.0 = identical)")
|
| 415 |
+
|
| 416 |
+
if cache_result['needs_warning'] and not is_perfect:
|
| 417 |
+
st.warning(" **Note:** Confidence is moderate. Review carefully.")
|
| 418 |
+
|
| 419 |
+
# Afficher les résultats
|
| 420 |
+
for result in cache_result['results']:
|
| 421 |
+
# Calculer distance code_soumis ↔ code_référence
|
| 422 |
+
code_ref = result['code']
|
| 423 |
+
if code_ref and code_ref != 'N/A':
|
| 424 |
+
code_ref_embedding = encode_text(code_ref, st.session_state.model, st.session_state.tokenizer)
|
| 425 |
+
code_submitted_embedding = encode_text(code_input, st.session_state.model, st.session_state.tokenizer)
|
| 426 |
+
|
| 427 |
+
# Cosine similarity
|
| 428 |
+
import numpy as np
|
| 429 |
+
similarity = np.dot(code_ref_embedding, code_submitted_embedding)
|
| 430 |
+
code_distance = 1 - similarity
|
| 431 |
+
else:
|
| 432 |
+
code_distance = None
|
| 433 |
+
|
| 434 |
+
with st.expander(f" Match #{result['rank']} (code→feedback distance: {result['distance']:.4f})"):
|
| 435 |
+
# Métriques côte à côte
|
| 436 |
+
col1, col2 = st.columns(2)
|
| 437 |
+
with col1:
|
| 438 |
+
st.metric("Code → Feedback", f"{result['distance']:.4f}", help="Distance entre votre code et ce feedback (apprentissage bi-encoder)")
|
| 439 |
+
with col2:
|
| 440 |
+
if code_distance is not None:
|
| 441 |
+
st.metric("Code → Code Ref", f"{code_distance:.4f}", help="Distance entre votre code et le code de référence pour ce feedback")
|
| 442 |
+
|
| 443 |
+
st.markdown("**Feedback:**")
|
| 444 |
+
st.write(result['feedback'])
|
| 445 |
+
|
| 446 |
+
st.markdown("**Reference Code (this feedback was given for):**")
|
| 447 |
+
st.code(result['code'], language='c')
|
| 448 |
+
|
| 449 |
+
st.markdown('</div>', unsafe_allow_html=True)
|
| 450 |
+
|
| 451 |
+
# Log stats
|
| 452 |
+
st.session_state.stats_logger.log_query({
|
| 453 |
+
"query_id": cache_result['query_id'],
|
| 454 |
+
"status": "hit",
|
| 455 |
+
"similarity_score": cache_result['similarity_scores'][0],
|
| 456 |
+
"confidence": cache_result['confidence'],
|
| 457 |
+
"response_time_ms": response_time,
|
| 458 |
+
"theme": theme,
|
| 459 |
+
"error_category": error_category,
|
| 460 |
+
"difficulty": difficulty,
|
| 461 |
+
"deepseek_tokens": 0,
|
| 462 |
+
"cache_size": st.session_state.collection.count()
|
| 463 |
+
})
|
| 464 |
+
|
| 465 |
+
# CACHE MISS
|
| 466 |
+
elif cache_result['status'] == 'miss':
|
| 467 |
+
st.markdown('<div class="miss-card">', unsafe_allow_html=True)
|
| 468 |
+
st.markdown("### Cache MISS - Generating New Feedback")
|
| 469 |
+
|
| 470 |
+
st.info(f" Closest match distance: {cache_result.get('closest_distance', 1.0):.4f} (threshold: {st.session_state.custom_threshold:.2f})")
|
| 471 |
+
|
| 472 |
+
# Afficher les codes les plus proches même en cas de miss
|
| 473 |
+
if cache_result['results']:
|
| 474 |
+
st.markdown("#### Closest matches found (but below threshold):")
|
| 475 |
+
for result in cache_result['results']:
|
| 476 |
+
# Calculer distance code_soumis ↔ code_référence
|
| 477 |
+
code_ref = result['code']
|
| 478 |
+
if code_ref and code_ref != 'N/A':
|
| 479 |
+
code_ref_embedding = encode_text(code_ref, st.session_state.model, st.session_state.tokenizer)
|
| 480 |
+
code_submitted_embedding = encode_text(code_input, st.session_state.model, st.session_state.tokenizer)
|
| 481 |
+
|
| 482 |
+
import numpy as np
|
| 483 |
+
similarity = np.dot(code_ref_embedding, code_submitted_embedding)
|
| 484 |
+
code_distance = 1 - similarity
|
| 485 |
+
else:
|
| 486 |
+
code_distance = None
|
| 487 |
+
|
| 488 |
+
with st.expander(f"Match #{result['rank']} (code→feedback: {result['distance']:.4f})"):
|
| 489 |
+
# Métriques côte à côte
|
| 490 |
+
col1, col2 = st.columns(2)
|
| 491 |
+
with col1:
|
| 492 |
+
st.metric("Code → Feedback", f"{result['distance']:.4f}", help="Distance bi-encoder (apprentissage)")
|
| 493 |
+
with col2:
|
| 494 |
+
if code_distance is not None:
|
| 495 |
+
st.metric("Code → Code Ref", f"{code_distance:.4f}", help="Distance code soumis vs code de référence")
|
| 496 |
+
|
| 497 |
+
st.markdown("**Feedback (given for reference code):**")
|
| 498 |
+
st.write(result['feedback'])
|
| 499 |
+
|
| 500 |
+
st.markdown("**Reference Code:**")
|
| 501 |
+
st.code(result['code'], language='c')
|
| 502 |
+
|
| 503 |
+
st.divider()
|
| 504 |
+
|
| 505 |
+
# Appeler DeepSeek
|
| 506 |
+
if st.session_state.deepseek_caller:
|
| 507 |
+
with st.spinner(" Generating feedback with DeepSeek..."):
|
| 508 |
+
deepseek_result = st.session_state.deepseek_caller.generate_feedback(context)
|
| 509 |
+
|
| 510 |
+
if deepseek_result.get('feedback'):
|
| 511 |
+
feedback = deepseek_result['feedback']
|
| 512 |
+
tokens_used = deepseek_result['tokens_total']
|
| 513 |
+
|
| 514 |
+
st.success(" Feedback Generated!")
|
| 515 |
+
|
| 516 |
+
col1, col2, col3 = st.columns(3)
|
| 517 |
+
with col1:
|
| 518 |
+
st.metric("Tokens Used", tokens_used)
|
| 519 |
+
with col2:
|
| 520 |
+
st.metric("Generation Time", f"{deepseek_result['generation_time_ms']:.0f} ms")
|
| 521 |
+
with col3:
|
| 522 |
+
st.metric("Total Time", f"{response_time + deepseek_result['generation_time_ms']:.0f} ms")
|
| 523 |
+
|
| 524 |
+
st.markdown("**Generated Feedback:**")
|
| 525 |
+
st.write(feedback)
|
| 526 |
+
|
| 527 |
+
# Distillation : Ajouter au cache
|
| 528 |
+
with st.spinner(" Adding to cache (distillation)..."):
|
| 529 |
+
# Encoder le feedback
|
| 530 |
+
feedback_embedding = encode_text(feedback, st.session_state.model, st.session_state.tokenizer)
|
| 531 |
+
|
| 532 |
+
success = st.session_state.cache_manager.add_to_cache(
|
| 533 |
+
code=code_input,
|
| 534 |
+
feedback=feedback,
|
| 535 |
+
metadata=context,
|
| 536 |
+
embedding=feedback_embedding
|
| 537 |
+
)
|
| 538 |
+
|
| 539 |
+
if success:
|
| 540 |
+
```python
            st.success(" Feedback added to cache for future queries!")

            # Log cache miss (format dataset)
            miss_data = {
                **context,
                "tags": [tag.strip() for tag in error_category.split(',') if tag.strip()] if error_category else [],
                "feedback": feedback,
                "query_id": cache_result['query_id'],
                "tokens_used": tokens_used
            }
            st.session_state.stats_logger.log_cache_miss(miss_data)

            # Log stats
            st.session_state.stats_logger.log_query({
                "query_id": cache_result['query_id'],
                "status": "miss",
                "similarity_score": cache_result.get('closest_distance', 1.0),
                "confidence": 1.0,  # LLM-generated = high confidence
                "response_time_ms": response_time + deepseek_result['generation_time_ms'],
                "theme": theme,
                "error_category": error_category,
                "difficulty": difficulty,
                "deepseek_tokens": tokens_used,
                "cache_size": st.session_state.collection.count()
            })
        else:
            st.error(f" Error: {deepseek_result.get('error', 'Unknown error')}")
    else:
        st.error(" DeepSeek API not available. Cannot generate feedback.")

    st.markdown('</div>', unsafe_allow_html=True)

else:
    st.info(" Please configure and load the model + dataset from the sidebar first.")

    st.markdown("""
    ### How to use:
    1. **Load Model & Dataset** (or use cached DB)
    2. **Fill in the form** with your code and its context
    3. **Submit** to get feedback
    4. **Check the Stats page** to see cache performance

    ### Cache System:
    - **Hit**: Similar code found in database (instant response)
    - **Miss**: No match found, generates new feedback (slower, uses API tokens)
    - **Distillation**: New feedbacks are automatically added to the cache
    """)
```
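The hit/miss decision behind the cache system can be sketched in a few lines. This is a simplified, standalone illustration (not the app's actual code path); the threshold value matches the project's `config.py` default, but the function name is made up:

```python
# Simplified sketch of the cache decision: the nearest-neighbour distance
# returned by the vector store decides between a cache hit and a miss.
SIMILARITY_THRESHOLD = 0.6  # same default as config.py

def classify_query(distances, threshold=SIMILARITY_THRESHOLD):
    """Return 'hit' when the closest stored feedback is near enough, else 'miss'."""
    if distances and distances[0] < threshold:
        return "hit"
    return "miss"

print(classify_query([0.25, 0.41]))  # hit: closest distance below threshold
print(classify_query([0.85]))        # miss: would trigger DeepSeek generation
print(classify_query([]))            # miss: empty cache
```

A miss is what triggers the distillation step: the freshly generated feedback is embedded and added back to the store, so the same query is a hit next time.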
assets/style.css
ADDED

@@ -0,0 +1,66 @@

```css
:root {
    --primary: #3b82f6;
    --secondary: #8b5cf6;
    --success: #10b981;
    --warning: #f59e0b;
    --danger: #ef4444;
}

.block-container {
    padding-top: 2rem;
    padding-bottom: 2rem;
}

h1 {
    background: linear-gradient(135deg, #3b82f6 0%, #8b5cf6 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}

.hit-card {
    background: linear-gradient(135deg, #10b981 0%, #059669 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 1rem;
    margin: 1rem 0;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}

.miss-card {
    background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 1rem;
    margin: 1rem 0;
    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
}

.result-header {
    display: flex;
    justify-content: space-between;
    margin-bottom: 1rem;
    font-weight: 600;
}

.confidence-badge {
    background: rgba(255, 255, 255, 0.2);
    padding: 0.2rem 0.8rem;
    border-radius: 2rem;
    font-size: 0.85rem;
}

.code-block {
    background: rgba(0, 0, 0, 0.3);
    padding: 1rem;
    border-radius: 0.5rem;
    font-family: monospace;
    font-size: 0.85rem;
    overflow-x: auto;
    margin-top: 0.5rem;
}

.stButton > button {
    background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
    color: white;
    border: none;
}
```
assets/style_search.css
ADDED

@@ -0,0 +1,52 @@

```css
/* Modern styling for result cards */
.result-card {
    background: white;
    border: 1px solid #e2e8f0;
    border-left: 5px solid #3b82f6;
    color: #1e293b;
    padding: 1.5rem;
    border-radius: 0.5rem;
    margin: 1rem 0;
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
    transition: transform 0.2s;
}

.result-card:hover {
    transform: translateY(-2px);
    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
}

/* Debug console styling (terminal-like) */
.debug-terminal {
    background-color: #1e1e1e;
    color: #4ade80;
    font-family: 'JetBrains Mono', 'Courier New', monospace;
    padding: 15px;
    border-radius: 8px;
    font-size: 0.85rem;
    margin-bottom: 10px;
    border: 1px solid #333;
    overflow-x: auto;
    line-height: 1.5;
}

/* Status badges */
.badge {
    padding: 4px 12px;
    border-radius: 9999px;
    font-weight: 600;
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.badge-success { background-color: #dcfce7; color: #166534; }
.badge-warning { background-color: #fef9c3; color: #854d0e; }
.badge-error { background-color: #fee2e2; color: #991b1b; }

/* Gradient title */
h1 {
    background: linear-gradient(90deg, #3b82f6 0%, #8b5cf6 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}
```
|
benchmarks/human_evaluation_results.csv
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_id,evaluator,timestamp,tech_validity,diagnostic_score,has_spoiler,is_maieutic,actionability,comment
|
| 2 |
+
0,Expert_1,2026-01-08T11:23:21.960462,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,Le feedback est un peu trop long
|
| 3 |
+
0,Expert_1,2026-01-08T13:09:23.940171,Oui (Correct),Cause Racine (ex: malloc manquant),False,False,Immédiat,Feedback trop long
|
| 4 |
+
1,Expert_1,2026-01-08T13:11:07.431838,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,"Parfait
|
| 5 |
+
@"
|
| 6 |
+
2,Expert_1,2026-01-08T13:11:50.182131,Discutable (Imprécis),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 7 |
+
3,Expert_1,2026-01-08T13:12:44.663000,Discutable (Imprécis),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 8 |
+
4,Expert_1,2026-01-08T13:12:50.609774,Discutable (Imprécis),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 9 |
+
5,Expert_1,2026-01-08T13:12:55.645751,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 10 |
+
6,Expert_1,2026-01-08T13:13:00.590676,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 11 |
+
7,Expert_1,2026-01-08T13:13:03.098498,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 12 |
+
8,Expert_1,2026-01-08T13:13:04.732185,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
| 13 |
+
9,Expert_1,2026-01-08T13:13:21.744115,Oui (Correct),Cause Racine (ex: malloc manquant),False,True,Immédiat,Parfait
|
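Aggregating these evaluations takes only the standard library. A hedged sketch, using an inline sample that reuses three rows from the file above (in the repo the full data lives in `benchmarks/human_evaluation_results.csv`):

```python
import csv
import io

# Count tech_validity labels from the human-evaluation CSV.
# Inline sample rows for illustration; column names match the real header.
sample = """dataset_id,evaluator,tech_validity
0,Expert_1,Oui (Correct)
2,Expert_1,Discutable (Imprécis)
5,Expert_1,Oui (Correct)
"""

counts = {}
for row in csv.DictReader(io.StringIO(sample)):
    counts[row["tech_validity"]] = counts.get(row["tech_validity"], 0) + 1

print(counts)  # {'Oui (Correct)': 2, 'Discutable (Imprécis)': 1}
```

Note that `csv.DictReader` also handles the quoted multi-line `comment` field in the real file (e.g. the `"Parfait\n@"` entry) without special casing.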
cache_manager.py
ADDED

@@ -0,0 +1,195 @@

```python
"""
Cache Manager - handles hit/miss decisions and online distillation
"""

import numpy as np
from typing import Dict, List, Any, Tuple
import uuid
from datetime import datetime
from config import SIMILARITY_THRESHOLD, TOP_K_RESULTS, CONFIDENCE_THRESHOLD_WARNING

class CacheManager:
    def __init__(self, chroma_collection, encoder_fn, threshold=None):
        """
        Args:
            chroma_collection: ChromaDB collection
            encoder_fn: Function that encodes text into an embedding
            threshold: Custom similarity threshold (if None, uses config default)
        """
        self.collection = chroma_collection
        self.encoder_fn = encoder_fn
        self.threshold = threshold if threshold is not None else SIMILARITY_THRESHOLD

    def calculate_confidence(self, distances: List[float]) -> float:
        """
        Compute a confidence score from the distances.
        Lower distance = higher confidence.

        Returns:
            float between 0 and 1
        """
        if not distances:
            return 0.0

        # Average distance
        avg_distance = np.mean(distances)

        # Convert distance to confidence (invert and clamp):
        # distance 0.0 -> confidence 1.0
        # distance 0.5 -> confidence 0.5
        # distance 1.0 -> confidence 0.0
        confidence = max(0.0, 1.0 - avg_distance)

        return round(confidence, 3)

    def query_cache(self, code: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Query the cache with the submitted code.

        Returns:
            {
                "status": "hit" | "miss" | "perfect_match",
                "results": [...],            # if hit
                "similarity_scores": [...],  # if hit
                "confidence": float,
                "needs_deepseek": bool,
                "query_embedding": [...],
                "query_id": str,
                "code_similarity": float | None  # code-to-code similarity, if computed
            }
        """
        # Generate the embedding for the submitted code
        query_embedding = self.encoder_fn(code)

        # Query ChromaDB
        query_results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=TOP_K_RESULTS
        )

        # Extract results
        distances = query_results['distances'][0] if query_results['distances'] else []
        documents = query_results['documents'][0] if query_results['documents'] else []
        metadatas = query_results['metadatas'][0] if query_results['metadatas'] else []

        # FALLBACK: check code-to-code similarity for the best match.
        # If the codes are identical (similarity ~1.0), force a HIT even if
        # the bi-encoder score is weak.
        code_similarity = None
        perfect_code_match = False

        if metadatas and metadatas[0].get('code'):
            ref_code = metadatas[0].get('code')
            if ref_code and ref_code != 'N/A':
                # Compute similarity between the submitted code and the reference code
                ref_code_embedding = self.encoder_fn(ref_code)
                code_similarity = float(np.dot(query_embedding, ref_code_embedding))

                # Near-identical codes (similarity > 0.95) count as a perfect match
                if code_similarity > 0.95:
                    perfect_code_match = True

        # Decide hit or miss:
        # HIT if bi-encoder distance < threshold OR the codes match perfectly
        if distances and (distances[0] < self.threshold or perfect_code_match):
            # CACHE HIT
            confidence = self.calculate_confidence(distances)

            results = []
            for i, (feedback, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                results.append({
                    "rank": i + 1,
                    "feedback": feedback,
                    "code": metadata.get('code', 'N/A'),
                    "distance": round(distance, 4),
                    "metadata": metadata
                })

            # Determine the hit type
            hit_status = "perfect_match" if perfect_code_match else "hit"

            return {
                "status": hit_status,
                "results": results,
                "similarity_scores": [round(d, 4) for d in distances],
                "confidence": 1.0 if perfect_code_match else confidence,  # max confidence on a perfect match
                "needs_deepseek": False,
                "needs_warning": False if perfect_code_match else (confidence < CONFIDENCE_THRESHOLD_WARNING),
                "query_embedding": query_embedding,
                "query_id": str(uuid.uuid4()),
                "code_similarity": round(code_similarity, 4) if code_similarity is not None else None,
                "perfect_code_match": perfect_code_match
            }

        else:
            # CACHE MISS
            # Even on a miss, keep the closest results
            closest_results = []
            for i, (feedback, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                closest_results.append({
                    "rank": i + 1,
                    "feedback": feedback,
                    "code": metadata.get('code', 'N/A'),
                    "distance": round(distance, 4),
                    "metadata": metadata
                })

            return {
                "status": "miss",
                "results": closest_results,  # include the closest matches
                "similarity_scores": [round(d, 4) for d in distances] if distances else [],
                "confidence": 0.0,
                "needs_deepseek": True,
                "needs_warning": False,
                "query_embedding": query_embedding,
                "query_id": str(uuid.uuid4()),
                "closest_distance": round(distances[0], 4) if distances else 1.0
            }

    def add_to_cache(self, code: str, feedback: str, metadata: Dict[str, Any], embedding: List[float]) -> bool:
        """
        Add a new entry to the cache (online distillation).

        Args:
            code: Source code
            feedback: Generated feedback
            metadata: Full metadata (theme, difficulty, etc.)
            embedding: Embedding of the feedback

        Returns:
            bool: True on success
        """
        try:
            doc_id = f"miss_{uuid.uuid4()}"

            # Prepare metadata for ChromaDB (only the code, due to metadata limitations)
            chroma_metadata = {
                "code": code,
                "timestamp": datetime.now().isoformat(),
                "source": "cache_miss"
            }

            self.collection.add(
                embeddings=[embedding],
                documents=[feedback],
                metadatas=[chroma_metadata],
                ids=[doc_id]
            )

            return True

        except Exception as e:
            print(f"Error adding to cache: {e}")
            return False

    def get_cache_stats(self) -> Dict[str, Any]:
        """Return cache statistics"""
        try:
            total_docs = self.collection.count()

            return {
                "total_documents": total_docs,
                "similarity_threshold": SIMILARITY_THRESHOLD,
                "top_k": TOP_K_RESULTS
            }
        except Exception as e:
            return {"error": str(e)}
```
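The confidence formula in `calculate_confidence` is easy to check in isolation. A standalone re-implementation (stdlib `statistics.mean` stands in for `np.mean`, so the snippet has no external dependencies):

```python
from statistics import mean

def calculate_confidence(distances):
    # Mirrors CacheManager.calculate_confidence: confidence = 1 - mean(distance),
    # clamped to [0, 1] and rounded to 3 decimals.
    if not distances:
        return 0.0
    return round(max(0.0, 1.0 - mean(distances)), 3)

print(calculate_confidence([0.2, 0.4]))  # 0.7
print(calculate_confidence([1.3]))       # 0.0 (clamped: distance beyond 1.0)
print(calculate_confidence([]))          # 0.0 (no results)
```

The linear mapping means a mean distance of 0.3 yields confidence 0.7, which is below the `CONFIDENCE_THRESHOLD_WARNING` of 0.9 and would therefore flag the hit with a warning.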
config.py
ADDED

@@ -0,0 +1,81 @@

````python
"""
Configuration for RAG Feedback System with Cache
"""

import os

# ==========================================
# CACHE CONFIGURATION
# ==========================================
SIMILARITY_THRESHOLD = 0.6  # A query is a HIT when distance < SIMILARITY_THRESHOLD
CONFIDENCE_THRESHOLD_WARNING = 0.9  # Show a warning when confidence < 0.9
TOP_K_RESULTS = 3  # Number of similar candidates to return

# ==========================================
# DEEPSEEK API
# ==========================================
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY')
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL = "deepseek-chat"
DEEPSEEK_TEMPERATURE = 0.7
DEEPSEEK_MAX_TOKENS = 1500

# ==========================================
# DATA PATHS
# ==========================================
CACHE_MISS_LOG = "streamlit_rag_viewer/data/cache_miss.jsonl"
STATS_LOG = "streamlit_rag_viewer/data/stats.jsonl"
CHROMA_DB_PATH = "streamlit_rag_viewer/chroma_db_storage"

# ==========================================
# SYSTEM PROMPT (Instructor)
# ==========================================
INSTRUCTOR_SYSTEM_PROMPT = """You are an expert C Programming Instructor helping students understand and fix bugs in their code.

## YOUR ROLE
You provide educational feedback to help students learn from their mistakes, not just fix the code.

## FEEDBACK GUIDELINES
1. **Educational Focus**: Explain the underlying concept, not just the solution
2. **No Direct Solutions**: Never say "change line X to Y" - guide understanding instead
3. **Conceptual Depth**: Reference domain definitions (e.g., what makes a number prime)
4. **Diagnostic Approach**: Help students understand WHY the bug exists
5. **Encouraging Tone**: Be supportive and constructive

## RESPONSE FORMAT
Provide a single, clear paragraph of feedback that:
- Identifies the conceptual error
- Explains the underlying principle
- Guides the student toward understanding the fix
- Keeps technical language appropriate for the student's level

## EXAMPLE GOOD FEEDBACK
"The bug relates to how arrays are passed to functions in C. When you use 'sizeof' on an array parameter, it doesn't give you the original array size—it returns the size of the pointer. Consider how array information is lost during function calls and what additional parameter you might need to track the actual array length."

## AVOID
- Direct code fixes: "Change `return 1` to `return 0`"
- Line-specific instructions: "On line 5, modify..."
- Giving away the answer: "The problem is you start count at 1 instead of 0"
"""

INSTRUCTOR_USER_PROMPT_TEMPLATE = """**Student Submission:**

**Exercise**: {theme}
**Difficulty**: {difficulty}
**Error Type**: {error_category}

**Instructions**: {instructions}

**Student's Buggy Code**:
```c
{code}
```

**Test Results**:
- Test Scope: {test_cases_scope}
- Failed Tests: {failed_tests}

---

**Task**: As an instructor, provide educational feedback to help this student understand and fix the bug. Focus on the underlying concepts, not the direct solution.
"""
````
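Rendering the user prompt is a plain `str.format` call over the template. A trimmed sketch (the short template here is an excerpt-style stand-in for `INSTRUCTOR_USER_PROMPT_TEMPLATE`, and the field values are invented for illustration):

```python
# Render a trimmed-down version of the instructor user prompt.
# Template excerpt and values below are illustrative stand-ins.
TEMPLATE = (
    "**Exercise**: {theme}\n"
    "**Difficulty**: {difficulty}\n"
    "**Error Type**: {error_category}\n"
    "**Failed Tests**: {failed_tests}"
)

prompt = TEMPLATE.format(
    theme="Check for Prime Number",
    difficulty="easy",
    error_category="Incorrect Base Case Return Value",
    failed_tests=["Input: 1", "Input: 0"],
)
print(prompt)
```

One caveat of `str.format` here: any literal `{` or `}` inside the student's code must arrive via a format field (as with `{code}` in the real template), never be pasted into the template itself, or formatting will raise a `KeyError`.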
data/.gitkeep
ADDED

File without changes
datasets/dataset_c_piscine_semantic.jsonl
ADDED

@@ -0,0 +1,10 @@
{"theme": "Check for Prime Number", "error_category": "Incorrect Base Case Return Value", "instructions": "Write a C function that checks if a given integer is a prime number. A prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. The function should return 1 if the number is prime, and 0 otherwise. Ensure to handle edge cases correctly, such as numbers less than 2.", "code": "#include <stdio.h>\n#include <stdbool.h>\n\nint is_prime(int n) {\n if (n <= 1) {\n return 1; // Target Bug: Should return 0 for non-prime numbers (n <= 1)\n }\n \n for (int i = 2; i * i <= n; i++) {\n if (n % i == 0) {\n return 0;\n }\n }\n return 1;\n}\n\nint main() {\n int num;\n printf(\"Enter an integer: \");\n scanf(\"%d\", &num);\n \n if (is_prime(num)) {\n printf(\"%d is a prime number.\\n\", num);\n } else {\n printf(\"%d is not a prime number.\\n\", num);\n }\n \n return 0;\n}", "test_cases_scope": ["Input: 1", "Input: 0", "Input: -5", "Input: 2", "Input: 3", "Input: 4", "Input: 17", "Input: 100"], "failed_tests": ["Input: 1", "Input: 0", "Input: -5"], "feedback": "The bug in this code is related to the base case handling for prime number checking. In the function, when checking if a number is prime, it's crucial to correctly identify numbers that are not prime by definition. Specifically, prime numbers are defined as natural numbers greater than 1. This means any number less than or equal to 1 (including negative numbers, zero, and one) should be classified as non-prime. The current implementation incorrectly returns 1 (true) for these cases, which mislabels them as prime. To fix this, review the mathematical definition of prime numbers and ensure the base case returns the appropriate value (0 for false) when the input does not meet the criteria. This teaches the importance of aligning code logic with domain-specific definitions and carefully handling edge cases in conditional statements."}
{"theme": "Array Processing", "error_category": "Logic Error", "instructions": "Write a function 'find_max' that takes an integer array and its size as arguments, and returns the maximum value in the array. The array may contain both positive and negative integers. If the array is empty (size 0), return the minimum possible integer value (INT_MIN).", "code": "#include <stdio.h>\n#include <limits.h>\n\nint find_max(int arr[], int n) {\n if (n <= 0) {\n return INT_MIN;\n }\n int max_val = arr[0];\n for (int i = 1; i < n; i = i) { // Target Bug: Infinite loop due to missing increment (i = i instead of i++)\n if (arr[i] > max_val) {\n max_val = arr[i];\n }\n }\n return max_val;\n}\n\nint main() {\n // Example usage (not part of core function, but included for context)\n int test1[] = {3, 7, 2, 9, 1};\n int size1 = 5;\n printf(\"Max in test1: %d\\n\", find_max(test1, size1));\n \n int test2[] = {-5, -1, -10};\n int size2 = 3;\n printf(\"Max in test2: %d\\n\", find_max(test2, size2));\n \n return 0;\n}", "test_cases_scope": "The function should handle arrays of various sizes, including edge cases like empty arrays, single-element arrays, and arrays with negative values.", "failed_tests": [{"input": "arr = [3, 7, 2, 9, 1], n = 5", "expected_output": "9", "actual_output": "Program hangs (infinite loop), no output produced"}, {"input": "arr = [-5, -1, -10], n = 3", "expected_output": "-1", "actual_output": "Program hangs (infinite loop), no output produced"}, {"input": "arr = [42], n = 1", "expected_output": "42", "actual_output": "Program hangs (infinite loop), no output produced"}], "feedback": "The code correctly initializes the maximum value and has a proper loop condition to iterate through the array indices. However, the loop does not progress because the loop variable is not updated in each iteration. In C, for-loops require a well-defined update expression to ensure termination; without it, the loop condition remains true indefinitely if the initial condition is met. This highlights the importance of ensuring that loop control variables are modified appropriately to avoid infinite execution, which can cause programs to hang and fail to produce results."}
{"theme": "N-Queens Problem (Backtracking)", "error_category": "Integer Overflow (Unchecked addition/multiplication)", "instructions": "Implement a function to count the number of solutions to the N-Queens problem using backtracking. The function should take an integer n (board size) and return the total number of valid queen placements. Use a standard backtracking approach with a 1D array to store column positions.", "code": "#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n\nint isSafe(int *board, int row, int col) {\n for (int i = 0; i < row; i++) {\n if (board[i] == col || abs(board[i] - col) == abs(i - row)) {\n return 0;\n }\n }\n return 1;\n}\n\nvoid solveNQueensUtil(int n, int row, int *board, int *count) {\n if (row == n) {\n (*count)++;\n return;\n }\n for (int col = 0; col < n; col++) {\n if (isSafe(board, row, col)) {\n board[row] = col;\n solveNQueensUtil(n, row + 1, board, count);\n }\n }\n}\n\nint totalNQueens(int n) {\n if (n <= 0) return 0;\n int *board = (int *)malloc(n * sizeof(int));\n if (!board) return -1;\n memset(board, -1, n * sizeof(int));\n int count = 0;\n solveNQueensUtil(n, 0, board, &count);\n free(board);\n return count;\n}\n\nint main() {\n int n;\n printf(\"Enter board size n: \");\n scanf(\"%d\", &n);\n int result = totalNQueens(n);\n if (result >= 0) {\n printf(\"Number of solutions for %d-Queens: %d\\n\", n, result);\n } else {\n printf(\"Memory allocation failed.\\n\");\n }\n return 0;\n}", "test_cases_scope": ["Test with n=1 (should return 1)", "Test with n=4 (should return 2)", "Test with n=8 (should return 92)", "Test with n=10 (should return 724)", "Test with n=12 (should return 14200)", "Test with n=14 (should return 365596)", "Test with n=0 or negative (should return 0)", "Memory check for n=10 (no leaks)"], "failed_tests": ["Test with n=12 (should return 14200, but may return incorrect value due to overflow)", "Test with n=14 (should return 365596, but may return incorrect value due to overflow)", "Larger n values (e.g., n=15 or higher) may produce wildly incorrect or negative results due to integer overflow in the count variable"], "feedback": "Your backtracking logic is correct, but consider the range of values your count variable can hold. For larger board sizes, the number of solutions grows rapidly—what happens when it exceeds the maximum value of an int? Think about how to handle potential overflow in arithmetic operations, especially when incrementing counters in recursive functions."}
{"theme": "Bit Manipulation with Pointer Errors", "error_category": "Invalid Pointer Cast", "instructions": "Write a C function that performs bit operations on an integer: set a specific bit to 1, unset a specific bit to 0, and toggle a specific bit (flip its value). The function should take an integer pointer and an operation code, and modify the integer in place. The bug involves an incorrect pointer cast that leads to undefined behavior.", "code": "#include <stdio.h>\n#include <stdlib.h>\n\ntypedef enum {\n SET_BIT,\n UNSET_BIT,\n TOGGLE_BIT\n} BitOperation;\n\nvoid bit_operation(int *num, BitOperation op, int bit_pos) {\n if (bit_pos < 0 || bit_pos >= (int)(sizeof(int) * 8)) {\n printf(\"Error: Bit position %d out of range.\\n\", bit_pos);\n return;\n }\n \n // Target Bug: Invalid pointer cast from int* to char*\n // This cast is incorrect because it assumes char* points to the same memory layout as int*,\n // leading to potential misalignment and undefined behavior when dereferencing.\n char *byte_ptr = (char *)num; // Bug: Casting int* to char* without proper handling\n \n // Attempt to access the specific byte containing the bit\n int byte_index = bit_pos / 8;\n int bit_in_byte = bit_pos % 8;\n \n // Dereferencing byte_ptr after invalid cast can cause issues\n char target_byte = byte_ptr[byte_index]; // Potential misalignment or segfault\n \n switch (op) {\n case SET_BIT:\n target_byte |= (1 << bit_in_byte);\n break;\n case UNSET_BIT:\n target_byte &= ~(1 << bit_in_byte);\n break;\n case TOGGLE_BIT:\n target_byte ^= (1 << bit_in_byte);\n break;\n default:\n printf(\"Error: Invalid operation.\\n\");\n return;\n }\n \n // Write back the modified byte\n byte_ptr[byte_index] = target_byte;\n}\n\nint main() {\n // Example usage\n int value = 0; // Binary: 0000...0000\n \n printf(\"Initial value: %d\\n\", value);\n \n bit_operation(&value, SET_BIT, 3); // Should set bit 3 to 1\n printf(\"After setting bit 3: %d\\n\", value);\n \n bit_operation(&value, TOGGLE_BIT, 3); // Should toggle bit 3 back to 0\n printf(\"After toggling bit 3: %d\\n\", value);\n \n bit_operation(&value, UNSET_BIT, 5); // Should unset bit 5 (already 0)\n printf(\"After unsetting bit 5: %d\\n\", value);\n \n return 0;\n}", "test_cases_scope": "The tests cover valid bit positions (0-31 for typical 32-bit ints), all operation types, and edge cases like out-of-range positions. The bug causes failures in memory access and alignment.", "failed_tests": [{"input": "bit_operation(&value, SET_BIT, 3) with value = 0", "expected_output": "value becomes 8 (binary 1000)", "actual_output": "Undefined behavior: may output 8, crash with segfault, or produce incorrect value due to misaligned access"}, {"input": "bit_operation(&value, TOGGLE_BIT, 10) with value = 1024", "expected_output": "value becomes 0 (toggling bit 10 in 1024)", "actual_output": "Undefined behavior: potential segfault or memory corruption"}, {"input": "bit_operation(&value, UNSET_BIT, 31) with value = 2147483648 on 32-bit system", "expected_output": "value becomes 0", "actual_output": "Undefined behavior: alignment error or crash"}, {"input": "Memory check with Valgrind or sanitizers", "expected_output": "No memory errors", "actual_output": "Errors reported: invalid read/write due to misaligned pointer access"}], "feedback": "The code attempts to manipulate individual bits by casting an integer pointer to a character pointer to access bytes directly. However, this approach overlooks critical aspects of pointer aliasing and memory alignment in C. When you cast between pointer types of different sizes and alignments (like from int* to char*), the compiler may not guarantee safe dereferencing because char* has different alignment requirements. This can lead to undefined behavior, such as segmentation faults or incorrect memory access, especially on architectures with strict alignment rules. The issue is not just about type conversion but about how pointers interpret memory layouts and the guarantees provided by the C standard regarding pointer arithmetic and access. Consider the implications of pointer casts on memory safety and alignment constraints."}
{"theme": "Print Binary Representation of Int", "error_category": "Incorrect Base Case Return Value", "instructions": "Write a recursive function `print_binary` that takes an integer `n` and prints its binary representation to stdout. For example, input 5 should output '101'. The function should handle positive integers only (assume input is non-negative). Use recursion to break down the number by dividing by 2.", "code": "#include <stdio.h>\n\nvoid print_binary(int n) {\n if (n == 0) {\n printf(\"0\");\n return;\n }\n \n if (n == 1) {\n printf(\"1\");\n return;\n }\n \n print_binary(n / 2);\n printf(\"%d\", n % 2);\n}\n\nint main() {\n int num;\n printf(\"Enter a non-negative integer: \");\n scanf(\"%d\", &num);\n \n if (num < 0) {\n printf(\"Error: Input must be non-negative.\\n\");\n return 1;\n }\n \n printf(\"Binary representation: \");\n print_binary(num);\n printf(\"\\n\");\n return 0;\n}", "test_cases_scope": ["Test with input 0: should output '0'", "Test with input 1: should output '1'", "Test with input 5: should output '101'", "Test with input 10: should output '1010'", "Test with input 2: should output '10'", "Test with input 7: should output '111'"], "failed_tests": ["Test with input 0: outputs '0' (correct)", "Test with input 1: outputs '1' (correct)", "Test with input 5: outputs '101' (correct)", "Test with input 10: outputs '1010' (correct)", "Test with input 2: outputs '10' (correct)", "Test with input 7: outputs '111' (correct)"], "feedback": "Your recursive function correctly handles the base cases for n == 0 and n == 1, but consider whether the base case logic aligns with the mathematical definition of binary representation. Think about what happens when n is exactly 1: does printing '1' and returning immediately ensure proper recursion for all inputs? Review how the recursive call and printing order work together, especially for numbers where the most significant bit might be affected by the base case handling."}
{"theme": "Implement 'tail' (Last N Lines)", "error_category": "Array decay misunderstanding (sizeof on array parameter)", "instructions": "Write a function 'print_last_n_lines' that takes an array of strings (lines) and an integer n, and prints the last n lines. If n is greater than the number of lines, print all lines. The function should handle the array correctly.", "code": "#include <stdio.h>\n#include <string.h>\n\nvoid print_last_n_lines(char *lines[], int n) {\n int total_lines = sizeof(lines) / sizeof(lines[0]);\n int start_index;\n \n if (n <= 0) {\n return;\n }\n \n if (n > total_lines) {\n start_index = 0;\n } else {\n start_index = total_lines - n;\n }\n \n for (int i = start_index; i < total_lines; i++) {\n printf(\"%s\\n\", lines[i]);\n }\n}\n\nint main() {\n char *test_lines[] = {\"Line 1\", \"Line 2\", \"Line 3\", \"Line 4\", \"Line 5\"};\n printf(\"Last 3 lines:\\n\");\n print_last_n_lines(test_lines, 3);\n return 0;\n}", "test_cases_scope": ["Test with n=3 on a 5-element array (should print lines 3-5)", "Test with n=5 on a 5-element array (should print all lines)", "Test with n=10 on a 5-element array (should print all lines)", "Test with n=0 (should print nothing)", "Test with different array sizes to check boundary conditions"], "failed_tests": ["Test with n=3 on a 5-element array (prints incorrect lines or crashes)", "Test with n=5 on a 5-element array (prints incorrect lines or crashes)", "Test with n=10 on a 5-element array (prints incorrect lines or crashes)", "Test with different array sizes (consistently produces wrong output)"], "feedback": "Think about how arrays are passed to functions in C. When you use 'sizeof' on an array parameter inside a function, does it give you the size of the original array or something else? Consider what happens to array information during function calls and how you might need to pass additional information to handle this correctly."}
{"theme": "Stack Implementation (Array-based)", "error_category": "Switch Case Fallthrough (Missing break)", "instructions": "Implement a stack using an array with functions to push, pop, and display elements. The program should present a menu to the user with options: 1) Push, 2) Pop, 3) Display, 4) Exit. Use a switch statement to handle the menu choices. The bug is a missing break statement in the switch case, causing unintended fallthrough behavior.", "code": "#include <stdio.h>\n#include <stdlib.h>\n\n#define MAX 5\n\nint stack[MAX];\nint top = -1;\n\nvoid push(int value) {\n if (top == MAX - 1) {\n printf(\"Stack Overflow!\\n\");\n } else {\n top++;\n stack[top] = value;\n printf(\"Pushed %d onto the stack.\\n\", value);\n }\n}\n\nvoid pop() {\n if (top == -1) {\n printf(\"Stack Underflow!\\n\");\n } else {\n printf(\"Popped %d from the stack.\\n\", stack[top]);\n top--;\n }\n}\n\nvoid display() {\n if (top == -1) {\n printf(\"Stack is empty.\\n\");\n } else {\n printf(\"Stack elements: \");\n for (int i = 0; i <= top; i++) {\n printf(\"%d \", stack[i]);\n }\n printf(\"\\n\");\n }\n}\n\nint main() {\n int choice, value;\n \n while (1) {\n printf(\"\\nMenu:\\n\");\n printf(\"1. Push\\n\");\n printf(\"2. Pop\\n\");\n printf(\"3. Display\\n\");\n printf(\"4. 
Exit\\n\");\n printf(\"Enter your choice: \");\n scanf(\"%d\", &choice);\n \n switch (choice) {\n case 1:\n printf(\"Enter value to push: \");\n scanf(\"%d\", &value);\n push(value);\n // Missing break statement here\n case 2:\n pop();\n break;\n case 3:\n display();\n break;\n case 4:\n printf(\"Exiting...\\n\");\n exit(0);\n default:\n printf(\"Invalid choice!\\n\");\n }\n }\n \n return 0;\n}", "test_cases_scope": ["Test 1: Push operation (e.g., push 10, then display)", "Test 2: Pop operation (e.g., push 20, then pop)", "Test 3: Multiple operations sequence (e.g., push 30, push 40, pop, display)", "Test 4: Invalid menu choice handling", "Test 5: Stack overflow and underflow conditions"], "failed_tests": ["Test 1: Input: Choose option 1, push 10 -> Expected Output: 'Pushed 10 onto the stack.' then menu again. Actual Output: 'Pushed 10 onto the stack.' followed immediately by 'Stack Underflow!' (since it falls through to case 2).", "Test 3: Input: Choose option 1, push 30; then choose option 1, push 40; then choose option 2 -> Expected Output: 'Pushed 30 onto the stack.', 'Pushed 40 onto the stack.', 'Popped 40 from the stack.'. Actual Output: After pushing 30, it falls through to pop() and may pop incorrectly or cause underflow, disrupting the sequence."], "feedback": "In C, switch cases execute sequentially until a break statement is encountered. Without break, control 'falls through' to the next case, which can lead to unintended multiple operations. Review how switch statements handle flow control to ensure each case is properly terminated."}
{"theme": "Singly Linked List: Get Element at Index", "error_category": "Off-by-one Error (0-indexed vs 1-indexed)", "instructions": "Write a function `get_at_index` that returns the data value at a given index in a singly linked list. The list uses 0-based indexing (first node at index 0). Return -1 if the index is out of bounds.", "code": "#include <stdio.h>\n#include <stdlib.h>\n\ntypedef struct Node {\n int data;\n struct Node* next;\n} Node;\n\nNode* create_node(int value) {\n Node* new_node = (Node*)malloc(sizeof(Node));\n if (new_node == NULL) {\n printf(\"Memory allocation failed\\n\");\n return NULL;\n }\n new_node->data = value;\n new_node->next = NULL;\n return new_node;\n}\n\nint get_at_index(Node* head, int index) {\n Node* current = head;\n int count = 1; // Bug: Should start at 0 for 0-based indexing\n \n while (current != NULL) {\n if (count == index) { // Bug: Compares count with index, but count starts at 1\n return current->data;\n }\n current = current->next;\n count++;\n }\n \n return -1; // Index out of bounds\n}\n\nint main() {\n // Example usage\n Node* head = create_node(10);\n head->next = create_node(20);\n head->next->next = create_node(30);\n \n printf(\"Element at index 0: %d\\n\", get_at_index(head, 0));\n printf(\"Element at index 1: %d\\n\", get_at_index(head, 1));\n printf(\"Element at index 2: %d\\n\", get_at_index(head, 2));\n printf(\"Element at index 3: %d\\n\", get_at_index(head, 3));\n \n // Free memory (simplified for example)\n free(head->next->next);\n free(head->next);\n free(head);\n \n return 0;\n}", "test_cases_scope": ["Test 1: Index 0 on list [10,20,30] -> Expected 10", "Test 2: Index 1 on list [10,20,30] -> Expected 20", "Test 3: Index 2 on list [10,20,30] -> Expected 30", "Test 4: Index 3 on list [10,20,30] -> Expected -1 (out of bounds)", "Test 5: Index -1 on list [10,20,30] -> Expected -1 (out of bounds)", "Test 6: Empty list, any index -> Expected -1"], "failed_tests": ["Test 1: Index 0 on list [10,20,30] -> 
Returns -1 instead of 10", "Test 2: Index 1 on list [10,20,30] -> Returns 10 instead of 20", "Test 3: Index 2 on list [10,20,30] -> Returns 20 instead of 30", "Test 4: Index 3 on list [10,20,30] -> Returns 30 instead of -1"], "feedback": "When implementing functions that use indices, carefully consider whether your indexing starts at 0 or 1. In C, linked lists and arrays typically use 0-based indexing, meaning the first element is at position 0. Ensure your counter variable and comparison logic align with this convention to avoid off-by-one errors."}
{"theme": "Sieve of Eratosthenes (Primes up to N)", "error_category": "Semicolon after Loop/If (Empty body)", "instructions": "This code implements the Sieve of Eratosthenes algorithm to find all prime numbers up to a given integer N. It uses a boolean array to mark non-prime numbers, starting from 2. The code has a deliberate error where a semicolon is placed after a loop or if statement, causing an empty body. Your task is to analyze the code, identify the bug, and understand how it leads to incorrect behavior without fixing it directly.", "code": "#include <stdio.h>\n#include <stdlib.h>\n#include <stdbool.h>\n\nvoid sieve_of_eratosthenes(int n) {\n if (n < 2) {\n printf(\"No primes less than 2.\\n\");\n return;\n }\n \n bool *is_prime = (bool *)malloc((n + 1) * sizeof(bool));\n if (is_prime == NULL) {\n printf(\"Memory allocation failed.\\n\");\n return;\n }\n \n for (int i = 0; i <= n; i++) {\n is_prime[i] = true;\n }\n is_prime[0] = false;\n is_prime[1] = false;\n \n for (int p = 2; p * p <= n; p++) {\n if (is_prime[p] == true); {\n for (int i = p * p; i <= n; i += p) {\n is_prime[i] = false;\n }\n }\n }\n \n printf(\"Prime numbers up to %d:\\n\", n);\n for (int i = 2; i <= n; i++) {\n if (is_prime[i]) {\n printf(\"%d \", i);\n }\n }\n printf(\"\\n\");\n \n free(is_prime);\n}\n\nint main() {\n int n;\n printf(\"Enter a positive integer N: \");\n scanf(\"%d\", &n);\n sieve_of_eratosthenes(n);\n return 0;\n}", "test_cases_scope": "The test cases cover various inputs to verify the correctness of the Sieve of Eratosthenes algorithm, including edge cases, small numbers, and typical values. 
They check for correct prime identification and proper memory management.", "failed_tests": ["Input: 10, Expected Output: Prime numbers up to 10: 2 3 5 7, Actual Output: Prime numbers up to 10: 2 3 4 5 6 7 8 9 10", "Input: 20, Expected Output: Prime numbers up to 20: 2 3 5 7 11 13 17 19, Actual Output: Prime numbers up to 20: 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20", "Input: 5, Expected Output: Prime numbers up to 5: 2 3 5, Actual Output: Prime numbers up to 5: 2 3 4 5"], "feedback": "The bug is a semicolon placed immediately after the 'if' condition in the marking loop, creating an empty body. This causes the inner 'for' loop, intended to mark multiples of prime numbers as non-prime, to always execute regardless of the condition. As a result, no numbers are properly marked as non-prime, leading the algorithm to output all numbers from 2 to N as primes. Focus on how control flow is affected by misplaced semicolons and ensure that loop or if bodies are correctly enclosed without unintended termination."}
{"theme": "Flood Fill Algorithm", "error_category": "Shadowing Variable (Local variable hides parameter)", "instructions": "Implement a flood fill function that recursively fills a 2D grid of characters from a starting point (x, y) with a new character 'new_char', replacing the target character 'target_char'. The grid is represented as a 2D array with dimensions 'rows' and 'cols'. Use recursion to traverse up, down, left, and right, ensuring bounds checking to avoid out-of-range access.", "code": "#include <stdio.h>\n#include <stdlib.h>\n\nvoid flood_fill(char **grid, int rows, int cols, int x, int y, char target_char, char new_char) {\n if (x < 0 || x >= rows || y < 0 || y >= cols) {\n return;\n }\n if (grid[x][y] != target_char) {\n return;\n }\n \n grid[x][y] = new_char;\n \n int rows = rows; // Shadowing variable: local 'rows' hides parameter 'rows'\n int cols = cols; // Shadowing variable: local 'cols' hides parameter 'cols'\n \n flood_fill(grid, rows, cols, x + 1, y, target_char, new_char);\n flood_fill(grid, rows, cols, x - 1, y, target_char, new_char);\n flood_fill(grid, rows, cols, x, y + 1, target_char, new_char);\n flood_fill(grid, rows, cols, x, y - 1, target_char, new_char);\n}\n\nint main() {\n // Example usage for testing\n int rows = 5, cols = 5;\n char **grid = (char **)malloc(rows * sizeof(char *));\n for (int i = 0; i < rows; i++) {\n grid[i] = (char *)malloc(cols * sizeof(char));\n for (int j = 0; j < cols; j++) {\n grid[i][j] = '.';\n }\n }\n \n // Set a region to fill\n grid[1][1] = 'X';\n grid[1][2] = 'X';\n grid[2][1] = 'X';\n grid[2][2] = 'X';\n \n printf(\"Before fill:\\n\");\n for (int i = 0; i < rows; i++) {\n for (int j = 0; j < cols; j++) {\n printf(\"%c \", grid[i][j]);\n }\n printf(\"\\n\");\n }\n \n flood_fill(grid, rows, cols, 1, 1, 'X', 'O');\n \n printf(\"\\nAfter fill:\\n\");\n for (int i = 0; i < rows; i++) {\n for (int j = 0; j < cols; j++) {\n printf(\"%c \", grid[i][j]);\n }\n printf(\"\\n\");\n }\n \n for (int i = 0; i < 
rows; i++) {\n free(grid[i]);\n }\n free(grid);\n return 0;\n}", "test_cases_scope": ["Test 1: Fill a 5x5 grid with a 2x2 'X' region starting at (1,1), expecting all 'X's replaced with 'O's", "Test 2: Fill a 3x3 grid with a single 'X' at (0,0), expecting only that cell changed", "Test 3: Fill a 4x4 grid with no 'X's, expecting no changes", "Test 4: Fill starting out of bounds, expecting no changes to grid"], "failed_tests": ["Test 1: Input grid with 'X's at (1,1), (1,2), (2,1), (2,2); Output grid has only starting cell (1,1) changed to 'O', others remain 'X'", "Test 2: Input grid with 'X' at (0,0); Output grid has cell (0,0) changed to 'O', but recursion stops early, missing adjacent cells if present"], "feedback": "In recursive functions, parameter variables are used to pass data between calls. If you declare a local variable with the same name as a parameter, it creates a shadowing effect where the local variable hides the parameter within its scope. This can lead to incorrect values being passed in recursive calls, especially when those values are meant to be constant (like array dimensions). Review how variable scope works in C and ensure that recursive calls receive the intended parameters without unintended modifications."}
|
deepseek_caller.py
ADDED
@@ -0,0 +1,110 @@
"""
DeepSeek API Wrapper for Feedback Generation
"""

from openai import OpenAI
from typing import Dict, Any
import time
from config import (
    DEEPSEEK_API_KEY,
    DEEPSEEK_BASE_URL,
    DEEPSEEK_MODEL,
    DEEPSEEK_TEMPERATURE,
    DEEPSEEK_MAX_TOKENS,
    INSTRUCTOR_SYSTEM_PROMPT,
    INSTRUCTOR_USER_PROMPT_TEMPLATE
)

class DeepSeekCaller:
    def __init__(self):
        if not DEEPSEEK_API_KEY:
            raise ValueError("DEEPSEEK_API_KEY not found in environment")

        self.client = OpenAI(
            api_key=DEEPSEEK_API_KEY,
            base_url=DEEPSEEK_BASE_URL
        )

    def generate_feedback(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Generate pedagogical feedback for a buggy piece of code.

        Args:
            context: {
                "theme": str,
                "difficulty": str,
                "error_category": str,
                "instructions": str,
                "code": str,
                "test_cases_scope": list,
                "failed_tests": list
            }

        Returns:
            {
                "feedback": str,
                "tokens_prompt": int,
                "tokens_completion": int,
                "tokens_total": int,
                "generation_time_ms": float
            }
        """
        start_time = time.time()

        # Format the prompt
        user_prompt = INSTRUCTOR_USER_PROMPT_TEMPLATE.format(
            theme=context.get('theme', 'N/A'),
            difficulty=context.get('difficulty', 'intermediate'),
            error_category=context.get('error_category', 'Unknown'),
            instructions=context.get('instructions', 'No instructions provided'),
            code=context.get('code', ''),
            test_cases_scope=str(context.get('test_cases_scope', [])),
            failed_tests=str(context.get('failed_tests', []))
        )

        try:
            response = self.client.chat.completions.create(
                model=DEEPSEEK_MODEL,
                messages=[
                    {"role": "system", "content": INSTRUCTOR_SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=DEEPSEEK_TEMPERATURE,
                max_tokens=DEEPSEEK_MAX_TOKENS
            )

            feedback = response.choices[0].message.content
            usage = response.usage

            generation_time = (time.time() - start_time) * 1000  # in ms

            return {
                "feedback": feedback,
                "tokens_prompt": usage.prompt_tokens,
                "tokens_completion": usage.completion_tokens,
                "tokens_total": usage.total_tokens,
                "generation_time_ms": generation_time
            }

        except Exception as e:
            return {
                "feedback": None,
                "error": str(e),
                "tokens_prompt": 0,
                "tokens_completion": 0,
                "tokens_total": 0,
                "generation_time_ms": (time.time() - start_time) * 1000
            }

    def test_connection(self) -> bool:
        """Check whether the API is reachable."""
        try:
            self.client.chat.completions.create(
                model=DEEPSEEK_MODEL,
                messages=[{"role": "user", "content": "Hi"}],
                max_tokens=5
            )
            return True
        except Exception as e:
            print(f"API Test Failed: {e}")
            return False
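The prompt assembly in `generate_feedback` is worth isolating: every field is pulled with `.get()` and a default, so a partial context dict never raises `KeyError`. A minimal, self-contained sketch of that pattern follows; the `TEMPLATE` below is a stand-in, since the real `INSTRUCTOR_USER_PROMPT_TEMPLATE` lives in config.py and is not part of this chunk.

```python
# Stand-in template; the real one is INSTRUCTOR_USER_PROMPT_TEMPLATE in config.py.
TEMPLATE = (
    "Theme: {theme}\n"
    "Error: {error_category}\n"
    "Code:\n{code}\n"
    "Failed tests: {failed_tests}"
)

def build_user_prompt(context: dict) -> str:
    # .get() with defaults mirrors generate_feedback(): missing keys never raise KeyError
    return TEMPLATE.format(
        theme=context.get("theme", "N/A"),
        error_category=context.get("error_category", "Unknown"),
        code=context.get("code", ""),
        failed_tests=str(context.get("failed_tests", [])),
    )

prompt = build_user_prompt({"theme": "Pointers", "code": "int *p;"})
print(prompt)
```

Because defaults are supplied per field, a caller can pass only the keys it has and still get a well-formed prompt.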
pages/benchmark.py
ADDED
@@ -0,0 +1,196 @@
import streamlit as st
import pandas as pd
import json
import os
from datetime import datetime

# ==========================================
# CONFIGURATION & CONSTANTS
# ==========================================
st.set_page_config(layout="wide", page_title="FFGen - Dataset Benchmark")

# File paths
DATASET_SOURCE = "datasets/dataset_c_piscine_semantic.jsonl"
OUTPUT_CSV = "benchmarks/human_evaluation_results.csv"

# Ensure directories exist
os.makedirs("benchmarks", exist_ok=True)
os.makedirs("datasets", exist_ok=True)

# ==========================================
# UTILITY FUNCTIONS
# ==========================================

@st.cache_data
def load_dataset_sample(filepath, sample_size=100):
    """Loads a sample of the dataset for evaluation"""
    data = []
    if os.path.exists(filepath):
        with open(filepath, 'r') as f:
            for i, line in enumerate(f):
                if i >= sample_size:
                    break
                if line.strip():
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError:
                        continue
    else:
        st.error(f"File not found: {filepath}")
        return pd.DataFrame()
    return pd.DataFrame(data)

def save_evaluation(eval_data):
    """Saves a single evaluation line"""
    df = pd.DataFrame([eval_data])
    if not os.path.exists(OUTPUT_CSV):
        df.to_csv(OUTPUT_CSV, index=False)
    else:
        df.to_csv(OUTPUT_CSV, mode='a', header=False, index=False)

# ==========================================
# STATE MANAGEMENT (SESSION STATE)
# ==========================================
if 'current_idx' not in st.session_state:
    st.session_state.current_idx = 0

# Load data
df = load_dataset_sample(DATASET_SOURCE)

# ==========================================
# MAIN INTERFACE
# ==========================================

st.title("FFGen Benchmark: Dataset Quality Audit")
st.markdown("""
**Evaluation Protocol:**
This interface aims to establish an objective quality score for the dataset.
Please evaluate each Code/Feedback pair according to the strict criteria below.
""")

if not df.empty:
    # Progress bar
    progress = st.session_state.current_idx / len(df)
    st.progress(progress, text=f"Progress: {st.session_state.current_idx + 1}/{len(df)}")

    # Get current entry
    if st.session_state.current_idx < len(df):
        row = df.iloc[st.session_state.current_idx]

        # --- LAYOUT: 2 COLUMNS ---
        col_data, col_eval = st.columns([1.2, 0.8])

        # --- LEFT COLUMN: DATA TO EVALUATE ---
        with col_data:
            st.subheader("🔍 Dataset Entry")

            # Context
            with st.expander("View exercise context", expanded=False):
                st.markdown(f"**Theme:** {row.get('theme', 'N/A')}")
                st.markdown(f"**Simulated Error:** {row.get('error_category', 'N/A')}")
                st.text(row.get('instructions', ''))

            # Code
            st.markdown("### Student Code (Input)")
            st.code(row.get('code', ''), language='c')

            # Feedback
            st.markdown("### AI Feedback (Output)")
            st.info(row.get('feedback', ''))

        # --- RIGHT COLUMN: EVALUATION GRID ---
        with col_eval:
            st.subheader("Audit Grid")

            with st.form("benchmark_form"):

                # CRITERION 1: TECHNICAL VALIDITY (Factual)
                st.markdown("#### 1. Technical Validity")
                technical_validity = st.radio(
                    "Does the feedback describe a correct technical reality in C?",
                    options=["Yes (Correct)", "No (Hallucination/False)", "Debatable (Imprecise)"],
                    index=None,
                    help="If the AI mentions a non-existent function or incorrect memory behavior, select NO."
                )

                # CRITERION 2: DIAGNOSTIC (Factual)
                st.markdown("#### 2. Diagnostic Precision")
                diagnostic_quality = st.radio(
                    "Does the feedback identify the root cause?",
                    options=[
                        "Root Cause (e.g., missing malloc)",
                        "Symptom Only (e.g., infinite loop/memory leak)",
                        "Off-topic"
                    ],
                    index=None
                )

                # CRITERION 3: PEDAGOGY (Epitech Method)
                st.markdown("#### 3. Pedagogical Compliance")

                col_c1, col_c2 = st.columns(2)
                with col_c1:
                    has_spoiler = st.checkbox("Contains solution (Fixed code/Spoiler)")
                with col_c2:
                    is_maieutic = st.checkbox("Asks a question or uses a guiding concept")

                # CRITERION 4: ACTIONABILITY
                st.markdown("#### 4. Actionability")
                actionability = st.select_slider(
                    "Can a beginner student fix their code using this feedback?",
                    options=["Impossible", "Difficult", "Doable", "Immediate"]
                )

                expert_name = st.text_input("Evaluator Name", value="Expert_1")
                comment = st.text_area("Specific comment (optional)")

                # SUBMISSION BUTTONS
                st.divider()
                submitted = st.form_submit_button("💾 Save Evaluation", type="primary")

                if submitted:
                    if technical_validity is None or diagnostic_quality is None:
                        st.error("Technical and diagnostic criteria are mandatory.")
                    else:
                        # Create data object
                        eval_entry = {
                            "dataset_id": row.get('id', st.session_state.current_idx),
                            "evaluator": expert_name,
                            "timestamp": datetime.now().isoformat(),
                            "tech_validity": technical_validity,
                            "diagnostic_score": diagnostic_quality,
                            "has_spoiler": has_spoiler,
                            "is_maieutic": is_maieutic,
                            "actionability": actionability,
                            "comment": comment
                        }

                        save_evaluation(eval_entry)
                        st.success("Validated!")
                        st.session_state.current_idx += 1
                        st.rerun()

    else:
        st.success("Audit complete for this sample!")

        # Immediate summary dashboard
        if os.path.exists(OUTPUT_CSV):
            res_df = pd.read_csv(OUTPUT_CSV)
            st.divider()
            st.subheader("Preliminary Results")

            c1, c2, c3 = st.columns(3)

            # Calculate validity rate
            # Note: We check if string contains "Yes" to match the English option
            valid_count = res_df[res_df['tech_validity'].str.contains("Yes")].shape[0]
            valid_rate = (valid_count / len(res_df)) * 100
            c1.metric("Technical Validity", f"{valid_rate:.1f}%")

            # Calculate spoiler rate
            spoil_count = res_df[res_df['has_spoiler'] == True].shape[0]
            spoil_rate = (spoil_count / len(res_df)) * 100
            c2.metric("Spoiler Rate (Should be low)", f"{spoil_rate:.1f}%", delta_color="inverse")

            c3.metric("Audited Samples", len(res_df))

else:
    st.warning(f"No dataset found at location: `{DATASET_SOURCE}`. Please check the path.")
pages/search.py
ADDED
@@ -0,0 +1,129 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import numpy as np
|
| 5 |
+
import time
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from utils import load_css
|
| 8 |
+
# ==========================================
|
| 9 |
+
# PAGE CONFIG
|
| 10 |
+
# ==========================================
|
| 11 |
+
st.set_page_config(page_title="Search & Logs Analysis", page_icon="🔍", layout="wide")
|
| 12 |
+
|
| 13 |
+
def load_css(file_name):
|
| 14 |
+
try:
|
| 15 |
+
css_file = Path(__file__).parent.parent / file_name
|
| 16 |
+
with open(css_file) as f:
|
| 17 |
+
st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
|
| 18 |
+
except FileNotFoundError: st.error(f"CSS file not found: {file_name}")
|
| 19 |
+
|
| 20 |
+
load_css("assets/style_search.css")
|
| 21 |
+
|
| 22 |
+
# ==========================================
|
| 23 |
+
# UTILS
|
| 24 |
+
# ==========================================
|
| 25 |
+
def encode_text(text, model, tokenizer):
|
| 26 |
+
if not text: return None
|
| 27 |
+
device = next(model.parameters()).device
|
| 28 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
|
| 29 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
| 30 |
+
with torch.no_grad():
|
| 31 |
+
outputs = model(**inputs)
|
| 32 |
+
embeddings = outputs.last_hidden_state.mean(dim=1)
|
| 33 |
+
embeddings = F.normalize(embeddings, p=2, dim=1)
|
| 34 |
+
return embeddings[0].cpu().numpy()
|
| 35 |
+
|
| 36 |
+
def calculate_cosine_distance(vec_a, vec_b):
|
| 37 |
+
if vec_a is None or vec_b is None: return 1.0
|
| 38 |
+
return 1.0 - min(max(np.dot(vec_a, vec_b), -1.0), 1.0)
|
| 39 |
+
|
| 40 |
+
# ==========================================
|
| 41 |
+
# UI
|
| 42 |
+
# ==========================================
|
| 43 |
+
if 'db_initialized' not in st.session_state or not st.session_state.db_initialized:
|
| 44 |
+
st.warning("**System Not Initialized**")
|
| 45 |
+
st.info("Please go to the **Home** page to load the model and dataset first.")
|
| 46 |
+
st.stop()
|
| 47 |
+
|
| 48 |
+
st.title("Semantic Search & Debugger")
|
| 49 |
+
st.markdown("Debug vector search and verify semantic distance between query and reference code.")
|
| 50 |
+
|
| 51 |
+
col_main, col_sidebar = st.columns([3, 1])
|
| 52 |
+
with col_main:
|
| 53 |
+
query_code = st.text_area("Input Code Snippet", height=200, placeholder="Paste code here...")

with col_sidebar:
    st.subheader("Search Params")
    k = st.slider("Retrieval Count (k)", 1, 20, 5)
    threshold_diff = st.slider("Divergence Threshold", 0.0, 1.0, 0.25, step=0.01)
    search_btn = st.button("🚀 Run Analysis", use_container_width=True)

st.markdown("### System Logs")
log_container = st.container()

if search_btn and query_code:
    start_time = time.time()
    with log_container:
        st.markdown('<div class="debug-terminal">', unsafe_allow_html=True)
        st.text("[*] Starting Analysis Pipeline...")

    try:
        with st.spinner("Encoding query..."):
            query_emb = encode_text(query_code, st.session_state.model, st.session_state.tokenizer)

        with log_container:
            st.text(f"[+] Query Encoded: Length {len(query_code)}, Norm {np.linalg.norm(query_emb):.4f}")

        with st.spinner(f"Querying ChromaDB (k={k})..."):
            results = st.session_state.collection.query(
                query_embeddings=[query_emb.tolist()], n_results=k, include=['documents', 'metadatas', 'distances']
            )

        st.divider()
        st.subheader("Analysis Results")
        valid_results = 0

        for i in range(len(results['documents'][0])):
            doc_id = results['ids'][0][i]
            feedback = results['documents'][0][i]
            ref_code = results['metadatas'][0][i].get('code', '')

            ref_code_emb = encode_text(ref_code, st.session_state.model, st.session_state.tokenizer)
            code_sem_dist = calculate_cosine_distance(query_emb, ref_code_emb)

            status_text = "REJECTED"
            status_class = "badge-error"
            if code_sem_dist < 0.05:
                status_class = "badge-success"; status_text = "EXACT MATCH"
            elif code_sem_dist < threshold_diff:
                status_class = "badge-warning"; status_text = "SEMANTIC MATCH"

            with log_container:
                st.text(f"  [Candidate #{i+1} - {doc_id}]")
                st.text(f"  > Code Dist : {code_sem_dist:.4f} | Status: {status_text}")

            if code_sem_dist < threshold_diff:
                valid_results += 1
                with st.expander(f"Result #{i+1} - {status_text} (Dist: {code_sem_dist:.3f})", expanded=True):
                    st.markdown(f"""
                    <div class="result-card">
                        <div style="display:flex; justify-content:space-between; margin-bottom:10px;">
                            <span class="badge {status_class}">{status_text}</span>
                            <small style="color:#64748b">ID: {doc_id}</small>
                        </div>
                        <p style="font-size:1.05rem; line-height:1.6;">{feedback}</p>
                    </div>""", unsafe_allow_html=True)
                    c1, c2 = st.columns(2)
                    with c1: st.caption("Your Code"); st.code(query_code, language='c')
                    with c2: st.caption("DB Code"); st.code(ref_code, language='c')

        total_time = (time.time() - start_time) * 1000
        with log_container:
            st.text(f"[*] Finished in {total_time:.2f}ms. Valid: {valid_results}/{k}")
            st.markdown('</div>', unsafe_allow_html=True)

        if valid_results == 0:
            st.warning("No results found within the semantic threshold.")

    except Exception as e:
        st.error(f"Analysis Failed: {str(e)}")
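The page above calls two helpers, `encode_text` and `calculate_cosine_distance`, that are defined elsewhere in the repo (likely `app.py` or `utils.py`). Assuming embeddings are plain NumPy vectors, the distance helper could be as minimal as this sketch (the function name matches the call sites; the implementation is an assumption):

```python
import numpy as np

def calculate_cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine distance: 0 = same direction, 1 = orthogonal, 2 = opposite."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # Treat zero vectors as maximally dissimilar rather than dividing by zero
        return 1.0
    return float(1.0 - np.dot(a, b) / denom)
```

This keeps the thresholds in the page meaningful: a distance below 0.05 is near-identical, below `threshold_diff` is semantically close.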
pages/stats.py
ADDED
@@ -0,0 +1,276 @@
"""
Statistics Dashboard
Displays metrics for the cache system
"""

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from stats_logger import StatsLogger

# ==========================================
# PAGE CONFIG
# ==========================================
st.set_page_config(
    page_title="Cache Statistics",
    layout="wide"
)

st.title("Cache Performance Statistics")

# ==========================================
# LOAD DATA
# ==========================================

logger = StatsLogger()

# Load data
stats = logger.read_stats()
summary = logger.get_summary_stats()
cache_misses = logger.read_cache_misses()

if not stats:
    st.warning("No data yet. Submit some queries first!")
    st.stop()

# Convert to DataFrame
df = pd.DataFrame(stats)

# Convert timestamp to datetime
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')

# ==========================================
# KPI METRICS
# ==========================================

st.header("Key Performance Indicators")

col1, col2, col3, col4 = st.columns(4)

with col1:
    st.metric(
        "Total Queries",
        f"{summary['total_queries']:,}",
        help="Total number of queries submitted"
    )

with col2:
    st.metric(
        "Cache Hit Rate",
        f"{summary['hit_rate']:.1f}%",
        delta=f"{summary['total_hits']} hits" if summary['total_hits'] > 0 else None,
        help="Percentage of queries resolved via cache"
    )

with col3:
    st.metric(
        "Avg Confidence",
        f"{summary['avg_confidence']:.2f}",
        help="Average confidence score for cache hits"
    )

with col4:
    st.metric(
        "DeepSeek Tokens",
        f"{summary['total_deepseek_tokens']:,}",
        delta=f"{summary['total_misses']} calls",
        delta_color="inverse",
        help="Total tokens consumed via DeepSeek API"
    )

st.divider()

# ==========================================
# TIME SERIES
# ==========================================

st.header("Query Timeline")

col1, col2 = st.columns(2)

with col1:
    # Hit/Miss over time
    fig = px.scatter(
        df,
        x='timestamp',
        y='confidence',
        color='status',
        size='response_time_ms',
        color_discrete_map={'hit': '#10b981', 'miss': '#ef4444'},
        title="Cache Hit/Miss Over Time",
        labels={
            'timestamp': 'Time',
            'confidence': 'Confidence Score',
            'status': 'Status',
            'response_time_ms': 'Response Time (ms)'
        }
    )
    fig.update_layout(height=400)
    st.plotly_chart(fig, use_container_width=True)

with col2:
    # Response time distribution
    fig = px.box(
        df,
        x='status',
        y='response_time_ms',
        color='status',
        color_discrete_map={'hit': '#10b981', 'miss': '#ef4444'},
        title="Response Time Distribution",
        labels={'response_time_ms': 'Response Time (ms)', 'status': 'Cache Status'}
    )
    fig.update_layout(height=400)
    st.plotly_chart(fig, use_container_width=True)

# ==========================================
# SIMILARITY SCORES
# ==========================================

st.header("Similarity Analysis")

col1, col2 = st.columns(2)

with col1:
    # Similarity distribution
    if 'similarity_score' in df.columns:
        fig = px.histogram(
            df,
            x='similarity_score',
            color='status',
            nbins=30,
            title="Similarity Score Distribution",
            labels={'similarity_score': 'Similarity Score (lower = more similar)'},
            color_discrete_map={'hit': '#10b981', 'miss': '#ef4444'}
        )
        fig.add_vline(x=0.3, line_dash="dash", line_color="orange",
                      annotation_text="Threshold (0.3)")
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

with col2:
    # Confidence vs Similarity
    hits_df = df[df['status'] == 'hit']
    if not hits_df.empty and 'similarity_score' in hits_df.columns:
        fig = px.scatter(
            hits_df,
            x='similarity_score',
            y='confidence',
            size='response_time_ms',
            title="Confidence vs Similarity (Hits Only)",
            labels={
                'similarity_score': 'Similarity Score',
                'confidence': 'Confidence',
                'response_time_ms': 'Response Time (ms)'
            },
            color='confidence',
            color_continuous_scale='viridis'
        )
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

# ==========================================
# ERROR CATEGORIES
# ==========================================

st.header("Error Categories Analysis")

col1, col2 = st.columns(2)

with col1:
    # Top error categories
    if 'error_category' in df.columns:
        error_counts = df['error_category'].value_counts().head(10)
        fig = px.bar(
            x=error_counts.values,
            y=error_counts.index,
            orientation='h',
            title="Top 10 Error Categories",
            labels={'x': 'Count', 'y': 'Error Category'},
            color=error_counts.values,
            color_continuous_scale='blues'
        )
        fig.update_layout(height=400, showlegend=False)
        st.plotly_chart(fig, use_container_width=True)

with col2:
    # Difficulty distribution
    if 'difficulty' in df.columns:
        diff_counts = df['difficulty'].value_counts()
        fig = px.pie(
            values=diff_counts.values,
            names=diff_counts.index,
            title="Difficulty Distribution",
            color_discrete_sequence=px.colors.sequential.RdBu
        )
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

# ==========================================
# CACHE MISSES LOG
# ==========================================

st.header("Recent Cache Misses")

if cache_misses:
    st.info(f"{len(cache_misses)} cache misses logged (ready for retraining)")

    # Display the last 5
    recent_misses = cache_misses[-5:]

    for i, miss in enumerate(reversed(recent_misses), 1):
        with st.expander(f"Miss #{len(cache_misses) - i + 1} - {miss.get('theme', 'N/A')} ({miss.get('error_category', 'N/A')})"):
            col1, col2 = st.columns([1, 1])

            with col1:
                st.markdown("**Code:**")
                st.code(miss.get('code', 'N/A'), language='c')

            with col2:
                st.markdown("**Generated Feedback:**")
                st.write(miss.get('feedback', 'N/A'))

            st.markdown(f"**Tokens Used:** {miss.get('tokens_used', 0)}")
            st.markdown(f"**Timestamp:** {miss.get('timestamp', 'N/A')}")
else:
    st.success("No cache misses yet - all queries resolved from cache!")

# ==========================================
# EXPORT DATA
# ==========================================

st.divider()

st.header("Export Data")

col1, col2 = st.columns(2)

with col1:
    if st.button("Download Stats CSV"):
        csv = df.to_csv(index=False)
        st.download_button(
            label="Download stats.csv",
            data=csv,
            file_name=f"cache_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
            mime="text/csv"
        )

with col2:
    if cache_misses and st.button("Download Cache Misses JSONL"):
        import json
        jsonl_content = "\n".join(json.dumps(miss) for miss in cache_misses)
        st.download_button(
            label="Download cache_miss.jsonl",
            data=jsonl_content,
            file_name=f"cache_miss_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl",
            mime="application/jsonl"
        )
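The KPI row above assumes each stats.jsonl record carries at least `status`, `confidence`, and `response_time_ms`. The hit-rate aggregation it performs can be sketched with pandas over a few made-up records (the values here are illustrative only):

```python
import pandas as pd

# Hypothetical records in the shape the dashboard reads from stats.jsonl
records = [
    {"status": "hit",  "confidence": 0.95, "response_time_ms": 120.0},
    {"status": "hit",  "confidence": 0.80, "response_time_ms": 150.0},
    {"status": "miss", "confidence": 0.00, "response_time_ms": 900.0},
    {"status": "miss", "confidence": 0.00, "response_time_ms": 1100.0},
]
df = pd.DataFrame(records)

# Share of queries answered from the cache, as a percentage
hit_rate = (df["status"] == "hit").mean() * 100
# Average confidence restricted to hits, as in the "Avg Confidence" metric
avg_hit_conf = df.loc[df["status"] == "hit", "confidence"].mean()
print(hit_rate, avg_hit_conf)  # → 50.0 0.875
```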
reload_config.py
ADDED
@@ -0,0 +1,18 @@
"""
Utility to force reload config without restarting Streamlit
"""

import importlib
import sys

def reload_config():
    """Force reload the config module"""
    if 'config' in sys.modules:
        import config
        importlib.reload(config)
        return True
    return False

if __name__ == "__main__":
    reload_config()
    print("Config reloaded!")
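`importlib.reload` only works on a module that has already been imported, which is why the guard checks `sys.modules` first. The pattern can be demonstrated end to end with a throwaway module written to a temp directory (the `demo_config` name and `THRESHOLD` value are invented for the demo; the real module is `config`):

```python
import importlib
import sys
import tempfile
from pathlib import Path

# Create a throwaway module on disk and make it importable
tmp = Path(tempfile.mkdtemp())
(tmp / "demo_config.py").write_text("THRESHOLD = 1\n")
sys.path.insert(0, str(tmp))

import demo_config
assert demo_config.THRESHOLD == 1

# Edit the source on disk, then reload to pick up the change
# (different file size, so the bytecode cache cannot go stale)
(tmp / "demo_config.py").write_text("THRESHOLD = 22\n")
if "demo_config" in sys.modules:  # same guard as reload_config()
    importlib.reload(demo_config)
print(demo_config.THRESHOLD)
```

Note that `reload` mutates the existing module object in place, so other modules holding a reference to `config` see the new values too.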
requirements.txt
CHANGED
@@ -1,3 +1,6 @@
streamlit==1.29.0
torch==2.1.0
transformers==4.36.0
peft==0.7.1
datasets==2.15.0
chromadb==0.4.18
# Also imported by pages/stats.py, pages/search.py and trend_analyzer.py
# (left unpinned here):
numpy
pandas
plotly
scikit-learn
run.sh
ADDED
@@ -0,0 +1,15 @@
#!/bin/bash
# Quick launcher for RAG Feedback System

cd "$(dirname "$0")"

echo "🚀 Starting RAG Feedback System..."
echo ""
echo "📊 Pages available:"
echo "  - Main:  http://localhost:8501"
echo "  - Stats: http://localhost:8501/stats"
echo ""
echo "Press Ctrl+C to stop"
echo ""

streamlit run app.py
stats_logger.py
ADDED
@@ -0,0 +1,180 @@
"""
Stats Logger - Records every query for later analysis
"""

import json
from datetime import datetime
from pathlib import Path
from typing import Dict, Any
from config import STATS_LOG, CACHE_MISS_LOG

class StatsLogger:
    def __init__(self):
        # Create the log directories if needed
        Path(STATS_LOG).parent.mkdir(parents=True, exist_ok=True)
        Path(CACHE_MISS_LOG).parent.mkdir(parents=True, exist_ok=True)

    def log_query(self, query_data: Dict[str, Any]) -> None:
        """
        Append a query record to stats.jsonl

        Args:
            query_data: {
                "query_id": str,
                "timestamp": str,
                "status": "hit" | "miss",
                "similarity_score": float,
                "confidence": float,
                "response_time_ms": float,
                "theme": str,
                "error_category": str,
                "difficulty": str,
                "deepseek_tokens": int,
                "cache_size": int
            }
        """
        # Add a timestamp if not already present
        if 'timestamp' not in query_data:
            query_data['timestamp'] = datetime.now().isoformat()

        try:
            with open(STATS_LOG, 'a', encoding='utf-8') as f:
                f.write(json.dumps(query_data, ensure_ascii=False) + '\n')
        except Exception as e:
            print(f"Error logging stats: {e}")

    def log_cache_miss(self, miss_data: Dict[str, Any]) -> None:
        """
        Record a cache miss with all its data (dataset format).

        Args:
            miss_data: {
                "theme": str,
                "difficulty": str,
                "tags": list,
                "error_category": str,
                "instructions": str,
                "code": str,
                "test_cases_scope": list,
                "failed_tests": list,
                "feedback": str,
                "query_id": str,
                "timestamp": str,
                "tokens_used": int
            }
        """
        # Add a timestamp
        if 'timestamp' not in miss_data:
            miss_data['timestamp'] = datetime.now().isoformat()

        try:
            with open(CACHE_MISS_LOG, 'a', encoding='utf-8') as f:
                f.write(json.dumps(miss_data, ensure_ascii=False) + '\n')
        except Exception as e:
            print(f"Error logging cache miss: {e}")

    def read_stats(self, limit: int = None) -> list:
        """
        Read the stats from the log file.

        Args:
            limit: Maximum number of lines to return (None = all)

        Returns:
            List of dicts
        """
        stats = []
        try:
            with open(STATS_LOG, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        stats.append(json.loads(line))

            if limit:
                return stats[-limit:]
            return stats

        except FileNotFoundError:
            return []
        except Exception as e:
            print(f"Error reading stats: {e}")
            return []

    def read_cache_misses(self, limit: int = None) -> list:
        """
        Read the cache misses from the log file.

        Args:
            limit: Maximum number of lines to return (None = all)

        Returns:
            List of dicts (dataset format)
        """
        misses = []
        try:
            with open(CACHE_MISS_LOG, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        misses.append(json.loads(line))

            if limit:
                return misses[-limit:]
            return misses

        except FileNotFoundError:
            return []
        except Exception as e:
            print(f"Error reading cache misses: {e}")
            return []

    def get_summary_stats(self) -> Dict[str, Any]:
        """
        Return a summary of the statistics.

        Returns:
            {
                "total_queries": int,
                "total_hits": int,
                "total_misses": int,
                "hit_rate": float,
                "avg_confidence": float,
                "total_deepseek_tokens": int,
                "avg_response_time_ms": float
            }
        """
        stats = self.read_stats()

        if not stats:
            return {
                "total_queries": 0,
                "total_hits": 0,
                "total_misses": 0,
                "hit_rate": 0.0,
                "avg_confidence": 0.0,
                "total_deepseek_tokens": 0,
                "avg_response_time_ms": 0.0
            }

        total_queries = len(stats)
        total_hits = sum(1 for s in stats if s.get('status') == 'hit')
        total_misses = total_queries - total_hits

        hit_rate = (total_hits / total_queries) * 100 if total_queries > 0 else 0.0

        confidences = [s.get('confidence', 0) for s in stats if s.get('confidence') is not None]
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

        total_tokens = sum(s.get('deepseek_tokens', 0) for s in stats)

        response_times = [s.get('response_time_ms', 0) for s in stats if s.get('response_time_ms')]
        avg_response_time = sum(response_times) / len(response_times) if response_times else 0.0

        return {
            "total_queries": total_queries,
            "total_hits": total_hits,
            "total_misses": total_misses,
            "hit_rate": round(hit_rate, 2),
            "avg_confidence": round(avg_confidence, 3),
            "total_deepseek_tokens": total_tokens,
            "avg_response_time_ms": round(avg_response_time, 2)
        }
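The logger's append-one-JSON-object-per-line format keeps writes simple for a single-process Streamlit app and makes the log trivially streamable. The round trip it relies on can be sketched with a temp file (stand-in path; the real paths come from `config.STATS_LOG` and `config.CACHE_MISS_LOG`):

```python
import json
import tempfile
from pathlib import Path

log = Path(tempfile.mkdtemp()) / "stats.jsonl"

# Append records one JSON object per line, as log_query() does
for rec in [{"query_id": "q1", "status": "hit"},
            {"query_id": "q2", "status": "miss"}]:
    with open(log, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Read back every non-empty line, as read_stats() does
records = [json.loads(line)
           for line in log.read_text(encoding="utf-8").splitlines()
           if line.strip()]
print(len(records), records[-1]["status"])  # → 2 miss
```

Because each record is a complete line, a partially written trailing line can at worst corrupt itself; `read_stats` skipping blank lines keeps earlier records readable.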
test_system.py
ADDED
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
Test script for the cache system
"""

import os
from deepseek_caller import DeepSeekCaller
from stats_logger import StatsLogger
from config import DEEPSEEK_API_KEY

def test_deepseek_connection():
    """Test if DeepSeek API is accessible"""
    print("=" * 70)
    print("🧪 TEST 1: DeepSeek API Connection")
    print("=" * 70)

    if not DEEPSEEK_API_KEY:
        print(" DEEPSEEK_API_KEY not found in environment")
        print(" Set it with: export DEEPSEEK_API_KEY='your-key'")
        return False

    try:
        caller = DeepSeekCaller()
        is_connected = caller.test_connection()

        if is_connected:
            print(" DeepSeek API is accessible")
            return True
        else:
            print(" DeepSeek API test failed")
            return False

    except Exception as e:
        print(f" Error: {e}")
        return False

def test_feedback_generation():
    """Test feedback generation"""
    print("\n" + "=" * 70)
    print("🧪 TEST 2: Feedback Generation")
    print("=" * 70)

    caller = DeepSeekCaller()

    test_context = {
        "theme": "Prime Number Check",
        "difficulty": "beginner",
        "error_category": "Incorrect Base Case Return Value",
        "instructions": "Write a function to check if a number is prime",
        "code": """
int is_prime(int n) {
    if (n <= 1) {
        return 1; // Bug: should return 0
    }
    for (int i = 2; i * i <= n; i++) {
        if (n % i == 0) return 0;
    }
    return 1;
}
""",
        "test_cases_scope": ["Input: 1", "Input: 2", "Input: 17"],
        "failed_tests": ["Input: 1 (returns 1, expected 0)"]
    }

    print(" Generating feedback...")
    result = caller.generate_feedback(test_context)

    if result.get('feedback'):
        print(" Feedback generated successfully!")
        print("\n Metrics:")
        print(f"   Tokens (prompt):     {result['tokens_prompt']}")
        print(f"   Tokens (completion): {result['tokens_completion']}")
        print(f"   Total tokens:        {result['tokens_total']}")
        print(f"   Generation time:     {result['generation_time_ms']:.0f} ms")
        print("\n Feedback:")
        print(f" {result['feedback'][:200]}...")
        return True
    else:
        print(f" Error: {result.get('error')}")
        return False

def test_stats_logger():
    """Test stats logging"""
    print("\n" + "=" * 70)
    print("🧪 TEST 3: Stats Logger")
    print("=" * 70)

    logger = StatsLogger()

    # Test query log
    test_query = {
        "query_id": "test-123",
        "status": "hit",
        "similarity_score": 0.15,
        "confidence": 0.95,
        "response_time_ms": 123.45,
        "theme": "Test Theme",
        "error_category": "Test Error",
        "difficulty": "beginner",
        "deepseek_tokens": 0,
        "cache_size": 100
    }

    try:
        logger.log_query(test_query)
        print(" Query logged successfully")

        # Read back
        stats = logger.read_stats(limit=1)
        if stats:
            print(f" Read back: {stats[-1]['query_id']}")
        else:
            print(" No stats found (empty file)")

        return True

    except Exception as e:
        print(f" Error: {e}")
        return False

def main():
    print(" TESTING CACHE SYSTEM COMPONENTS")
    print()

    results = []

    # Test 1: API Connection
    results.append(("DeepSeek API", test_deepseek_connection()))

    # Test 2: Feedback Generation
    if results[0][1]:  # Only if API works
        results.append(("Feedback Generation", test_feedback_generation()))
    else:
        print("\n⏭ Skipping feedback generation test (API unavailable)")

    # Test 3: Stats Logger
    results.append(("Stats Logger", test_stats_logger()))

    # Summary
    print("\n" + "=" * 70)
    print(" TEST SUMMARY")
    print("=" * 70)

    for test_name, passed in results:
        status = " PASS" if passed else " FAIL"
        print(f"{status:12} {test_name}")

    total_tests = len(results)
    passed_tests = sum(1 for _, passed in results if passed)

    print()
    print(f"Total: {passed_tests}/{total_tests} tests passed")

    if passed_tests == total_tests:
        print(" All tests passed! System is ready.")
        return 0
    else:
        print(" Some tests failed. Check configuration.")
        return 1

if __name__ == "__main__":
    exit(main())
trend_analyzer.py
ADDED
@@ -0,0 +1,556 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Trend Analysis Module for RAG Feedback Viewer
==============================================

This module analyzes patterns in student code submissions by tracking:
- Recurring difficulties by author_id and code_id
- Common error patterns through feedback clustering
- Topic identification for problematic areas
"""

import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from typing import Dict, List, Tuple, Optional
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go


class TrendAnalyzer:
    """Analyze trends and patterns in code feedback data"""

    def __init__(self):
        self.data_cache = None
        self.cluster_cache = None

    def load_data_from_collection(self, collection, dataset: List[Dict]) -> pd.DataFrame:
        """
        Load and structure data from ChromaDB collection and original dataset

        Args:
            collection: ChromaDB collection
            dataset: Original dataset with code_id and author_id

        Returns:
            DataFrame with all necessary fields
        """
        # Get all data from collection
        results = collection.get(include=['embeddings', 'documents', 'metadatas'])

        # Build DataFrame
        data = []
        for doc_id, embedding, document, metadata in zip(
            results['ids'], results['embeddings'], results['documents'], results['metadatas']
        ):
            # Extract index from doc_id (format: "doc_123")
            idx = int(doc_id.split('_')[1])

            # Get corresponding dataset entry
            if idx < len(dataset):
                original_entry = dataset[idx]
                data.append({
                    'id': doc_id,
                    'feedback': document,
                    'code': metadata.get('code', ''),
                    'code_id': original_entry.get('code_id', 'unknown'),
                    'author_id': original_entry.get('author_id', 'unknown'),
                    'embedding': embedding
                })

        df = pd.DataFrame(data)
        self.data_cache = df
        return df

    def analyze_authors_cluster_diversity(self, df_clustered: pd.DataFrame) -> pd.DataFrame:
        """
        Analyze how many different clusters each author appears in.
        Authors in many clusters = struggling with multiple different concepts.

        Args:
            df_clustered: DataFrame with cluster labels and author_id

        Returns:
            DataFrame with author statistics sorted by cluster diversity
        """
        author_stats = df_clustered.groupby('author_id').agg({
            'cluster': lambda x: list(x),
            'feedback': 'count',
            'code_id': lambda x: list(x)
        }).reset_index()

        # Calculate cluster diversity metrics
        author_stats['unique_clusters'] = author_stats['cluster'].apply(lambda x: len(set(x)))
        author_stats['total_feedbacks'] = author_stats['feedback']
        author_stats['cluster_diversity_ratio'] = author_stats['unique_clusters'] / author_stats['total_feedbacks']
        author_stats['cluster_list'] = author_stats['cluster'].apply(lambda x: sorted(set(x)))

        # Rename columns
        author_stats = author_stats[['author_id', 'total_feedbacks', 'unique_clusters',
                                     'cluster_diversity_ratio', 'cluster_list', 'code_id']]
        author_stats.columns = ['author_id', 'total_feedbacks', 'unique_clusters',
                                'diversity_ratio', 'clusters', 'code_ids']

        # Sort by unique clusters (descending) then by total feedbacks
        author_stats = author_stats.sort_values(['unique_clusters', 'total_feedbacks'],
                                                ascending=[False, False])

        return author_stats

    def find_recurring_authors(self, df: pd.DataFrame, min_submissions: int = 3) -> pd.DataFrame:
        """
        Identify authors with multiple submissions

        Args:
            df: DataFrame with author_id column
            min_submissions: Minimum number of submissions to be considered recurring

        Returns:
            DataFrame with author statistics
        """
        author_stats = df.groupby('author_id').agg({
            'code_id': 'count',
            'feedback': lambda x: list(x)
        }).reset_index()

        author_stats.columns = ['author_id', 'submission_count', 'feedbacks']
        author_stats = author_stats[author_stats['submission_count'] >= min_submissions]
        author_stats = author_stats.sort_values('submission_count', ascending=False)

        return author_stats

    def find_recurring_code_patterns(self, df: pd.DataFrame, min_occurrences: int = 2) -> pd.DataFrame:
        """
        Identify code_ids that appear multiple times (same exercise attempted multiple times)

        Args:
            df: DataFrame with code_id column
            min_occurrences: Minimum occurrences to be considered recurring

        Returns:
            DataFrame with code pattern statistics
        """
        code_stats = df.groupby('code_id').agg({
            'author_id': 'count',
            'feedback': lambda x: list(x),
            'code': 'first'
        }).reset_index()

        code_stats.columns = ['code_id', 'occurrence_count', 'feedbacks', 'sample_code']
        code_stats = code_stats[code_stats['occurrence_count'] >= min_occurrences]
        code_stats = code_stats.sort_values('occurrence_count', ascending=False)

        return code_stats

    def find_optimal_clusters(self, embeddings: np.ndarray, max_k: int = 20) -> int:
        """
        Find optimal number of clusters using silhouette score

        Args:
            embeddings: Embedding matrix
            max_k: Maximum number of clusters to try

        Returns:
            Optimal number of clusters
        """
        from sklearn.metrics import silhouette_score

        # Try different k values
        min_k = max(2, min(5, len(embeddings) // 10))  # At least 2, typically start at 5
        max_k = min(max_k, len(embeddings) // 5)       # Don't have too many small clusters

        best_score = -1
        best_k = 10  # Default fallback

        for k in range(min_k, max_k + 1):
            try:
                clusterer = KMeans(n_clusters=k, random_state=42, n_init=10)
                labels = clusterer.fit_predict(embeddings)
                score = silhouette_score(embeddings, labels, metric='cosine',
                                         sample_size=min(1000, len(embeddings)))

                if score > best_score:
                    best_score = score
                    best_k = k
            except Exception:
                continue

        return best_k

    def cluster_feedbacks(
        self,
        df: pd.DataFrame,
        n_clusters: int = None,
        method: str = 'kmeans'
    ) -> Tuple[pd.DataFrame, np.ndarray, int]:
        """
        Cluster feedbacks based on embeddings to identify common themes

        Args:
            df: DataFrame with embedding column
            n_clusters: Number of clusters (if None, will find optimal)
            method: 'kmeans' or 'dbscan'

        Returns:
            Tuple of (DataFrame with cluster labels, cluster info, number of clusters used)
        """
        embeddings = np.array(df['embedding'].tolist())

        if method == 'kmeans':
            # Find optimal if not specified
            if n_clusters is None:
                n_clusters = self.find_optimal_clusters(embeddings)

            clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = clusterer.fit_predict(embeddings)
            cluster_info = clusterer.cluster_centers_
        else:  # dbscan
            clusterer = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
            labels = clusterer.fit_predict(embeddings)
            cluster_info = labels
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

        df_clustered = df.copy()
        df_clustered['cluster'] = labels

        self.cluster_cache = {
            'labels': labels,
            'method': method,
            'cluster_info': cluster_info,
            'n_clusters': n_clusters
        }

        return df_clustered, cluster_info, n_clusters

    def get_cluster_statistics(self, df_clustered: pd.DataFrame) -> pd.DataFrame:
        """
        Get statistics for each cluster

        Args:
            df_clustered: DataFrame with cluster labels

        Returns:
            DataFrame with cluster statistics
        """
        cluster_stats = df_clustered.groupby('cluster').agg({
            'code_id': 'count',
            'author_id': lambda x: len(set(x)),
            'feedback': lambda x: list(x)[:5],  # Top 5 samples
            'code': lambda x: list(x)[:3]       # Top 3 code samples
        }).reset_index()

        cluster_stats.columns = [
            'cluster_id',
            'feedback_count',
            'unique_authors',
            'sample_feedbacks',
            'sample_codes'
        ]

        cluster_stats = cluster_stats.sort_values('feedback_count', ascending=False)

        return cluster_stats

    def find_author_difficulties(
        self,
        df_clustered: pd.DataFrame,
        author_id: str
    ) -> Dict:
        """
        Analyze specific author's difficulty patterns

        Args:
            df_clustered: DataFrame with cluster labels
            author_id: Author ID to analyze

        Returns:
            Dictionary with author difficulty analysis
        """
        author_data = df_clustered[df_clustered['author_id'] == author_id]

        if len(author_data) == 0:
            return {'error': 'Author not found'}

        # Cluster distribution
        cluster_dist = author_data['cluster'].value_counts().to_dict()

        # Most common issues (based on cluster membership)
        primary_clusters = author_data['cluster'].value_counts().head(3).index.tolist()

        return {
            'author_id': author_id,
            'total_submissions': len(author_data),
            'cluster_distribution': cluster_dist,
            'primary_difficulty_clusters': primary_clusters,
            'feedbacks': author_data['feedback'].tolist(),
            'codes': author_data['code'].tolist()
        }

    def identify_common_topics(
        self,
        df_clustered: pd.DataFrame,
        top_n: int = 10,
        use_llm: bool = True
    ) -> List[Dict]:
        """
        Identify most common topics/difficulties across all students

        Args:
            df_clustered: DataFrame with cluster labels
            top_n: Number of top topics to return
            use_llm: Use LLM-based topic extraction (better quality)

        Returns:
            List of topic dictionaries
        """
        cluster_stats = self.get_cluster_statistics(df_clustered)

        topics = []
        for _, row in cluster_stats.head(top_n).iterrows():
            if use_llm:
                # Use LLM to extract meaningful concepts
                concepts = self._extract_concepts_with_llm(row['sample_feedbacks'])
            else:
                # Fallback to basic keyword extraction
                concepts = self._extract_keywords_basic(row['sample_feedbacks'])

            topics.append({
                'cluster_id': int(row['cluster_id']),
                'student_count': int(row['unique_authors']),
                'occurrence_count': int(row['feedback_count']),
                'key_concepts': concepts,
                'sample_feedbacks': row['sample_feedbacks'][:3],
                'sample_codes': row['sample_codes'][:2]
            })

        return topics

    def _extract_concepts_with_llm(self, feedbacks: List[str]) -> List[str]:
        """
        Extract programming concepts using zero-shot classification

        Args:
            feedbacks: List of feedback texts

        Returns:
            List of key programming concepts
        """
        from transformers import pipeline

        # Combine top feedbacks
        combined_text = ' '.join(feedbacks[:5])[:1000]  # Limit length

        # Candidate programming concepts
        candidate_labels = [
            "edge cases",
            "loop conditions",
            "variable initialization",
            "return values",
            "pointer management",
            "memory allocation",
            "array indexing",
            "conditional logic",
            "recursion",
            "null handling",
            "boundary conditions",
            "type conversion",
            "function parameters",
            "error handling",
            "algorithm efficiency",
            "data structure usage",
            "integer overflow",
            "off-by-one errors",
            "scope issues",
            "logic errors"
        ]

        try:
            # Use zero-shot classification
            classifier = pipeline("zero-shot-classification",
                                  model="facebook/bart-large-mnli",
                                  device=-1)  # CPU

            result = classifier(combined_text, candidate_labels, multi_label=True)

            # Get top 5 concepts with score > 0.3
            concepts = [
                label for label, score in zip(result['labels'], result['scores'])
                if score > 0.3
            ][:5]

            return concepts if concepts else ["general programming"]

        except Exception as e:
            print(f"LLM extraction failed: {e}, falling back to basic")
            return self._extract_keywords_basic(feedbacks)

    def _extract_keywords_basic(self, feedbacks: List[str]) -> List[str]:
        """
        Basic keyword extraction (fallback)

        Args:
            feedbacks: List of feedback texts

        Returns:
            List of keywords
        """
        all_feedbacks = ' '.join(feedbacks)
        words = all_feedbacks.lower().split()

        # Extended stop words - more comprehensive
        stop_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
            'can', 'could', 'may', 'might', 'your', 'you', 'this', 'that', 'these',
            'those', 'when', 'where', 'why', 'how', 'which', 'who', 'what',
            'after', 'before', 'during', 'while', 'about', 'into', 'through',
            'their', 'there', 'them', 'they', 'then', 'than', 'such', 'some',
            'it', 'its', 'if', 'else', 'not', 'all', 'any', 'each', 'every'
        }

        # Programming-specific keywords to keep
        prog_keywords = {
            'loop', 'variable', 'function', 'return', 'pointer', 'array',
            'condition', 'edge', 'case', 'null', 'memory', 'allocation',
            'index', 'recursion', 'parameter', 'overflow', 'boundary'
        }

        keywords = []
        for w in words:
            w_clean = w.strip('.,;:!?()')
            if (w_clean in prog_keywords) or (w_clean not in stop_words and len(w_clean) > 4):
                keywords.append(w_clean)

        keyword_counts = Counter(keywords).most_common(5)
        return [kw for kw, _ in keyword_counts]

    def visualize_clusters_2d(
        self,
        df_clustered: pd.DataFrame,
        sample_size: int = 1000
    ) -> go.Figure:
        """
        Create 2D visualization of feedback clusters using t-SNE

        Args:
            df_clustered: DataFrame with embeddings and cluster labels
            sample_size: Max number of points to visualize (for performance)

        Returns:
            Plotly figure
        """
        # Sample data if too large
        if len(df_clustered) > sample_size:
            df_sample = df_clustered.sample(n=sample_size, random_state=42)
        else:
            df_sample = df_clustered

        # Extract embeddings
        embeddings = np.array(df_sample['embedding'].tolist())

        # Reduce to 2D using t-SNE (perplexity must stay below the sample count)
        tsne = TSNE(n_components=2, random_state=42,
                    perplexity=min(30, len(df_sample) - 1))
        embeddings_2d = tsne.fit_transform(embeddings)

        # Create visualization DataFrame with full feedback
        viz_df = pd.DataFrame({
            'x': embeddings_2d[:, 0],
            'y': embeddings_2d[:, 1],
            'cluster': df_sample['cluster'].astype(str),
            'feedback': df_sample['feedback'].tolist(),    # Full feedback
            'author_id': df_sample['author_id'].tolist(),  # Full author_id
            'code_id': df_sample['code_id'].tolist()
        })

        # Create scatter plot with custom hover template
        fig = go.Figure()

        # Add trace for each cluster
        for cluster_id in sorted(viz_df['cluster'].unique()):
            cluster_data = viz_df[viz_df['cluster'] == cluster_id]

            fig.add_trace(go.Scatter(
                x=cluster_data['x'],
                y=cluster_data['y'],
                mode='markers',
                name=f'Cluster {cluster_id}',
                marker=dict(size=8, opacity=0.7),
                customdata=np.column_stack((
                    cluster_data['feedback'],
                    cluster_data['author_id'],
                    cluster_data['code_id']
                )),
                hovertemplate='<b>Cluster %{fullData.name}</b><br><br>' +
                              '<b>Feedback:</b><br>%{customdata[0]}<br><br>' +
                              '<b>Author ID:</b> %{customdata[1]}<br>' +
                              '<b>Code ID:</b> %{customdata[2]}<br>' +
                              '<extra></extra>'
            ))

        fig.update_layout(
            title='Feedback Clusters - 2D Projection (t-SNE)',
            xaxis_title='',  # Remove dimension labels
            yaxis_title='',
            xaxis=dict(showticklabels=False),  # Hide axis ticks
            yaxis=dict(showticklabels=False),
            height=600,
            hovermode='closest',
            legend_title='Cluster'
        )

        return fig

    def get_difficulty_heatmap(
        self,
        df_clustered: pd.DataFrame,
        top_n_authors: int = 20,
        top_n_clusters: int = 10
    ) -> go.Figure:
        """
        Create heatmap showing which authors struggle with which topics

        Args:
            df_clustered: DataFrame with cluster labels and author_id
            top_n_authors: Number of top authors to show
            top_n_clusters: Number of top clusters to show

        Returns:
            Plotly figure
        """
        # Get top authors by submission count
        top_authors = df_clustered['author_id'].value_counts().head(top_n_authors).index

        # Get top clusters by frequency
        top_clusters = df_clustered['cluster'].value_counts().head(top_n_clusters).index

        # Filter data
        df_filtered = df_clustered[
            (df_clustered['author_id'].isin(top_authors)) &
            (df_clustered['cluster'].isin(top_clusters))
        ]

        # Create pivot table
        heatmap_data = df_filtered.groupby(['author_id', 'cluster']).size().reset_index(name='count')
        heatmap_pivot = heatmap_data.pivot(index='author_id', columns='cluster', values='count').fillna(0)

        # Create heatmap
        fig = go.Figure(data=go.Heatmap(
            z=heatmap_pivot.values,
            x=[f'Cluster {c}' for c in heatmap_pivot.columns],
            y=[f'Author {a[:8]}...' for a in heatmap_pivot.index],
            colorscale='YlOrRd',
            text=heatmap_pivot.values,
            texttemplate='%{text}',
            textfont={"size": 10},
            colorbar=dict(title="Submission Count")
        ))

        fig.update_layout(
            title='Student Difficulty Patterns - Author vs Topic Clusters',
            xaxis_title='Difficulty Cluster',
            yaxis_title='Student ID',
            height=600
        )

        return fig
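The per-author diversity metric used by `analyze_authors_cluster_diversity` (unique clusters divided by total feedbacks, so a ratio near 1.0 means every submission landed in a different difficulty cluster) can be sketched without pandas. `cluster_diversity` below is a hypothetical standalone helper, not part of the module:

```python
from collections import defaultdict

def cluster_diversity(records):
    """Given (author_id, cluster_label) pairs, return per-author stats:
    {author_id: (total_feedbacks, unique_clusters, diversity_ratio)}."""
    clusters_by_author = defaultdict(list)
    for author_id, cluster in records:
        clusters_by_author[author_id].append(cluster)
    return {
        author: (len(cs), len(set(cs)), len(set(cs)) / len(cs))
        for author, cs in clusters_by_author.items()
    }

# Author "a1" hits 3 distinct clusters across 4 submissions; "a2" has one.
stats = cluster_diversity([("a1", 0), ("a1", 2), ("a1", 2), ("a1", 5), ("a2", 1)])
```

Sorting such stats by unique clusters, then total feedbacks, mirrors the module's ordering of struggling students.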
utils.py
ADDED
@@ -0,0 +1,4 @@
import streamlit as st
def load_css(file_name):
    with open(file_name) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)