# Hugging Face Space: Jaimodiji / Report-Generator (status: Running)
# File size: 21,478 Bytes

from flask import Blueprint, jsonify, current_app, render_template, request
from flask_login import login_required, current_user
from utils import get_db_connection, sync_neetprep_collection
import os
import time
import json
from processing import resize_image_if_needed, call_nim_ocr_api
from gemini_classifier import classify_questions_with_gemini
from gemma_classifier import GemmaClassifier
from nova_classifier import classify_questions_with_nova
import requests
from nvidia_prompts import BIOLOGY_PROMPT_TEMPLATE, CHEMISTRY_PROMPT_TEMPLATE, PHYSICS_PROMPT_TEMPLATE, MATHEMATICS_PROMPT_TEMPLATE

# Blueprint on which every route in this module is registered.
classifier_bp = Blueprint('classifier_bp', __name__)

# Instantiate classifiers
# One GemmaClassifier is created when the module is imported;
# extract_and_classify_all calls its classify() method when the current
# user's classifier_model is 'gemma'.
gemma_classifier = GemmaClassifier()

def get_nvidia_prompt(subject, input_questions):
    """Build the NVIDIA classification prompt for the given subject.

    Args:
        subject: Subject name. Matched case-insensitively (surrounding
            whitespace ignored) against biology, chemistry, physics and
            mathematics.
        input_questions: Text substituted for the ``input_questions``
            placeholder of the selected prompt template.

    Returns:
        The formatted prompt string, or ``None`` when ``subject`` is not a
        string or does not name a supported subject.
    """
    # The subject arrives straight from request JSON, so it can be a number,
    # list or null; treat anything that is not text as "unsupported" instead
    # of raising AttributeError on .lower().
    if not isinstance(subject, str):
        return None

    subject_key = subject.strip().lower()
    if subject_key == 'biology':
        template = BIOLOGY_PROMPT_TEMPLATE
    elif subject_key == 'chemistry':
        template = CHEMISTRY_PROMPT_TEMPLATE
    elif subject_key == 'physics':
        template = PHYSICS_PROMPT_TEMPLATE
    elif subject_key == 'mathematics':
        template = MATHEMATICS_PROMPT_TEMPLATE
    else:
        return None

    return template.format(input_questions=input_questions)

@classifier_bp.route('/get_topic_suggestions', methods=['POST'])
@login_required
def get_topic_suggestions():
    """Suggest chapter titles for one question using the NVIDIA chat API.

    Expects a JSON body with ``subject`` (required) and ``question_text``
    and/or ``image_id``. When the text is missing but ``image_id`` is given,
    the stored ``question_text`` is used; if that is empty as well, the
    processed image is OCR'd and the recognised text is saved back to the
    ``questions`` table.

    Returns:
        A JSON response. On success: ``{'success': True, 'suggestions': [...],
        'full_response': <parsed model output>}``. On failure ``{'error': ...}``
        with status 400 (missing subject, no text obtainable, unsupported
        subject), 403 (the image belongs to another user) or 500 (OCR,
        configuration or API failure).
    """
    # A missing or non-object JSON body is treated as an empty request so the
    # normal validation below produces the 400 response.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    question_text = data.get('question_text')
    image_id = data.get('image_id')
    subject = data.get('subject')

    if not subject:
        return jsonify({'error': 'Subject is required'}), 400

    # If text is missing but we have image_id, try to get from DB or run OCR
    if not question_text and image_id:
        conn = None
        try:
            conn = get_db_connection()
            # Check DB first. The sessions join exposes the owner so that a
            # user can only read / OCR questions from their own sessions,
            # matching the ownership checks of the other routes in this file.
            row = conn.execute(
                """
                SELECT q.question_text AS question_text,
                       i.processed_filename AS processed_filename,
                       s.user_id AS user_id
                FROM questions q
                JOIN images i ON q.image_id = i.id
                JOIN sessions s ON i.session_id = s.id
                WHERE i.id = ?
                """,
                (image_id,),
            ).fetchone()

            if row:
                if row['user_id'] != current_user.id:
                    return jsonify({'error': 'Unauthorized'}), 403

                if row['question_text']:
                    question_text = row['question_text']
                elif row['processed_filename']:
                    # Run OCR
                    image_path = os.path.join(
                        current_app.config['PROCESSED_FOLDER'],
                        row['processed_filename'],
                    )
                    if os.path.exists(image_path):
                        image_bytes = resize_image_if_needed(image_path)
                        ocr_result = call_nim_ocr_api(image_bytes)

                        ocr_data = ocr_result.get('data')
                        if ocr_data and ocr_data[0].get('text_detections'):
                            question_text = " ".join(
                                item['text_prediction']['text']
                                for item in ocr_data[0]['text_detections']
                            )
                            # Save back to DB
                            conn.execute(
                                'UPDATE questions SET question_text = ? WHERE image_id = ?',
                                (question_text, image_id),
                            )
                            conn.commit()
        except Exception as e:
            current_app.logger.error(f"Error fetching/OCRing text for image {image_id}: {e}")
            return jsonify({'error': f"OCR failed: {str(e)}"}), 500
        finally:
            # Close on every path (success, early return, exception) so the
            # connection is never leaked.
            if conn is not None:
                conn.close()

    if not question_text:
        return jsonify({'error': 'Could not obtain question text (OCR failed or no text found).'}), 400

    # Prepare prompt
    # The prompt expects "Input Questions: [Insert ...]".
    # We format the single question as "1. {text}" to match the pattern somewhat,
    # though the prompt handles raw text too.
    input_formatted = f"1. {question_text}"
    prompt_content = get_nvidia_prompt(subject, input_formatted)

    if not prompt_content:
        return jsonify({'error': f'Unsupported subject: {subject}'}), 400

    # Call NVIDIA API
    nvidia_api_key = os.environ.get('NVIDIA_API_KEY')
    if not nvidia_api_key:
        return jsonify({'error': 'NVIDIA_API_KEY not set'}), 500

    invoke_url = 'https://integrate.api.nvidia.com/v1/chat/completions'
    headers = {
        'Authorization': f'Bearer {nvidia_api_key}',
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }

    payload = {
        "model": "nvidia/nemotron-3-nano-30b-a3b",
        "messages": [
            {
                "content": prompt_content,
                "role": "user"
            }
        ],
        "temperature": 0.2,  # Slightly higher for variety in top-k if supported, but here we just want accurate multiple suggestions
        "top_p": 1,
        "max_tokens": 1024,
        "stream": False
    }

    try:
        response = requests.post(invoke_url, headers=headers, json=payload, timeout=30)
        response.raise_for_status()
        result = response.json()

        content = result['choices'][0]['message']['content']

        # Parse JSON from content (it might be wrapped in markdown code blocks)
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0].strip()
        elif "```" in content:
            content = content.split("```")[1].split("```")[0].strip()

        # Kept separate from the request body (`data`) to avoid shadowing it.
        parsed = json.loads(content)

        # Extract suggestions; anything that is not the expected
        # {"data": [{...}, ...]} shape simply yields no suggestions.
        suggestions = []
        entries = parsed.get('data') if isinstance(parsed, dict) else None
        if isinstance(entries, list) and entries and isinstance(entries[0], dict):
            first_entry = entries[0]
            suggestions.append(first_entry.get('chapter_title', 'Unclassified'))

            # Alternative suggestions are optional in the model output.
            others = first_entry.get('other_possible_chapters')
            if isinstance(others, list):
                suggestions.extend(others)

        return jsonify({'success': True, 'suggestions': suggestions, 'full_response': parsed})

    except Exception as e:
        current_app.logger.error(f"NVIDIA API Error: {e}")
        return jsonify({'error': str(e)}), 500

@classifier_bp.route('/classified/update_single', methods=['POST'])
@login_required
def update_question_classification_single():
    """Set the subject and chapter of the question linked to one image.

    Expects a JSON body with ``image_id`` (required), ``subject`` and
    ``chapter``. The image must belong to a session owned by the current
    user. After the update the session's neetprep collection is re-synced.

    Returns:
        ``{'success': True}`` on success, otherwise ``{'error': ...}`` with
        status 400 (no image id), 403 (not the owner) or 500 (any exception).
    """
    # A missing or non-object JSON body falls through to the 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    image_id = data.get('image_id')
    subject = data.get('subject')
    chapter = data.get('chapter')

    if not image_id:
        return jsonify({'error': 'Image ID is required'}), 400

    conn = None
    try:
        conn = get_db_connection()
        # Security: Check ownership via session -> images. The session id is
        # fetched in the same query because it is needed for the sync below.
        image_row = conn.execute("""
            SELECT s.user_id, i.session_id
            FROM images i
            JOIN sessions s ON i.session_id = s.id
            WHERE i.id = ?
        """, (image_id,)).fetchone()

        if not image_row or image_row['user_id'] != current_user.id:
            return jsonify({'error': 'Unauthorized'}), 403

        conn.execute(
            'UPDATE questions SET subject = ?, chapter = ? WHERE image_id = ?',
            (subject, chapter, image_id)
        )

        # Auto-convert and sync neetprep collection
        sync_neetprep_collection(conn, image_row['session_id'], current_user.id)

        conn.commit()
        return jsonify({'success': True})
    except Exception as e:
        current_app.logger.error(f"Error updating question classification for image {image_id}: {e}")
        return jsonify({'error': str(e)}), 500
    finally:
        # Always release the connection, including on the error paths.
        if conn is not None:
            conn.close()



@classifier_bp.route('/classified/edit')
@login_required
def edit_classified_questions():
    """Render the page for editing the current user's classified questions.

    Loads every question of the user that has both a subject and a chapter,
    adds a ``question_text_plain`` preview (first 100 characters followed by
    ``...`` when longer), and collects the user's distinct chapters and tags
    as suggestions for the template.

    Returns:
        The rendered ``classified_edit.html`` template.
    """
    AVAILABLE_SUBJECTS = ["Biology", "Chemistry", "Physics", "Mathematics"]

    conn = get_db_connection()
    try:
        # Security: Fetch questions belonging to the current user
        questions_from_db = conn.execute("""
            SELECT q.id, q.question_text, q.chapter, q.subject, q.tags 
            FROM questions q
            JOIN sessions s ON q.session_id = s.id
            WHERE s.user_id = ? AND q.subject IS NOT NULL AND q.chapter IS NOT NULL 
            ORDER BY q.id
        """, (current_user.id,)).fetchall()

        # Suggestions should also be user-specific
        chapters = conn.execute('SELECT DISTINCT q.chapter FROM questions q JOIN sessions s ON q.session_id = s.id WHERE s.user_id = ? AND q.chapter IS NOT NULL ORDER BY q.chapter', (current_user.id,)).fetchall()
        tags_query = conn.execute('SELECT DISTINCT q.tags FROM questions q JOIN sessions s ON q.session_id = s.id WHERE s.user_id = ? AND q.tags IS NOT NULL AND q.tags != \'\'', (current_user.id,)).fetchall()
    finally:
        # All rows are fetched above, so the connection can be released
        # before rendering, and is released even if a query fails.
        conn.close()

    questions = []
    for q in questions_from_db:
        q_dict = dict(q)
        plain_text = q_dict.get('question_text') or ''  # Handle None
        q_dict['question_text_plain'] = (plain_text[:100] + '...') if len(plain_text) > 100 else plain_text
        questions.append(q_dict)

    # Tags are stored comma-separated; drop blanks produced by stray commas
    # (e.g. "a,,b" or "a,") so no empty suggestion is offered.
    all_tags = set()
    for row in tags_query:
        all_tags.update(tag.strip() for tag in row['tags'].split(',') if tag.strip())

    return render_template('classified_edit.html',
                           questions=questions,
                           chapters=[c['chapter'] for c in chapters],
                           all_tags=sorted(all_tags),
                           available_subjects=AVAILABLE_SUBJECTS)

@classifier_bp.route('/classified/update_question/<int:question_id>', methods=['POST'])
@login_required
def update_classified_question(question_id):
    """Update the chapter and subject of one question owned by the user.

    Args:
        question_id: Primary key of the question, taken from the URL.

    Expects a JSON body with non-empty ``chapter`` and ``subject``.

    Returns:
        ``{'success': True}`` on success, otherwise ``{'error': ...}`` with
        status 400 (missing field), 403 (not the owner) or 500 (any exception).
    """
    # A missing or non-object JSON body falls through to the 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    new_chapter = data.get('chapter')
    new_subject = data.get('subject')

    if not new_chapter or not new_subject:
        return jsonify({'error': 'Chapter and Subject cannot be empty.'}), 400

    conn = None
    try:
        conn = get_db_connection()
        # Security: Check ownership before update
        question_owner = conn.execute("SELECT s.user_id FROM questions q JOIN sessions s ON q.session_id = s.id WHERE q.id = ?", (question_id,)).fetchone()
        if not question_owner or question_owner['user_id'] != current_user.id:
            return jsonify({'error': 'Unauthorized'}), 403

        conn.execute(
            'UPDATE questions SET chapter = ?, subject = ? WHERE id = ?',
            (new_chapter, new_subject, question_id)
        )
        conn.commit()
        return jsonify({'success': True})
    except Exception as e:
        current_app.logger.error(f"Error updating question {question_id}: {repr(e)}")
        return jsonify({'error': str(e)}), 500
    finally:
        # Always release the connection, including on the error paths.
        if conn is not None:
            conn.close()

@classifier_bp.route('/classified/delete_question/<int:question_id>', methods=['DELETE'])
@login_required
def delete_classified_question(question_id):
    """Remove the classification of one question owned by the user.

    The question row itself is kept: its ``subject`` and ``chapter`` are set
    to NULL and the matching ``'classified'`` bookmark is deleted.

    Args:
        question_id: Primary key of the question, taken from the URL.

    Returns:
        ``{'success': True}`` on success, otherwise ``{'error': ...}`` with
        status 403 (not the owner) or 500 (any exception).
    """
    conn = None
    try:
        conn = get_db_connection()
        # Security: Check ownership before delete
        question_owner = conn.execute("SELECT s.user_id FROM questions q JOIN sessions s ON q.session_id = s.id WHERE q.id = ?", (question_id,)).fetchone()
        if not question_owner or question_owner['user_id'] != current_user.id:
            return jsonify({'error': 'Unauthorized'}), 403

        # Update the question to remove classification
        conn.execute('UPDATE questions SET subject = NULL, chapter = NULL WHERE id = ?', (question_id,))

        # Remove bookmark too (the bookmark table is queried with the id as a string)
        conn.execute('DELETE FROM neetprep_bookmarks WHERE neetprep_question_id = ? AND question_type = ?', (str(question_id), 'classified'))

        conn.commit()
        return jsonify({'success': True})
    except Exception as e:
        current_app.logger.error(f"Error deleting question {question_id}: {repr(e)}")
        return jsonify({'error': str(e)}), 500
    finally:
        # Always release the connection, including on the error paths.
        if conn is not None:
            conn.close()

@classifier_bp.route('/classified/delete_many', methods=['POST'])
@login_required
def delete_many_classified_questions():
    """Remove the classification of several questions in one request.

    Expects a JSON body ``{'ids': [...]}``. IDs that do not belong to the
    current user are silently ignored. For the owned ones, ``subject`` and
    ``chapter`` are set to NULL and the matching ``'classified'`` bookmarks
    are deleted.

    Returns:
        ``{'success': True}`` (with a ``message`` when none of the IDs is
        owned), otherwise ``{'error': ...}`` with status 400 (no IDs) or
        500 (any exception).
    """
    # A missing or non-object JSON body falls through to the 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    question_ids = data.get('ids', [])

    # Require a real list: a string such as "12" would otherwise be iterated
    # character by character and treated as the ids 1 and 2.
    if not isinstance(question_ids, list) or not question_ids:
        return jsonify({'error': 'No question IDs provided.'}), 400

    conn = None
    try:
        conn = get_db_connection()
        # Security: Filter IDs to only those owned by the user
        placeholders = ','.join('?' for _ in question_ids)
        owned_q_ids_rows = conn.execute(f"""
            SELECT q.id FROM questions q
            JOIN sessions s ON q.session_id = s.id
            WHERE q.id IN ({placeholders}) AND s.user_id = ?
        """, (*question_ids, current_user.id)).fetchall()

        owned_q_ids = [row['id'] for row in owned_q_ids_rows]

        if not owned_q_ids:
            return jsonify({'success': True, 'message': 'No owned questions to delete.'})

        update_placeholders = ','.join('?' for _ in owned_q_ids)
        conn.execute(f'UPDATE questions SET subject = NULL, chapter = NULL WHERE id IN ({update_placeholders})', owned_q_ids)

        # Remove bookmarks too (the bookmark table is queried with ids as strings)
        owned_ids_str = [str(qid) for qid in owned_q_ids]
        conn.execute(f"DELETE FROM neetprep_bookmarks WHERE neetprep_question_id IN ({update_placeholders}) AND question_type = 'classified'", owned_ids_str)

        conn.commit()
        return jsonify({'success': True})
    except Exception as e:
        current_app.logger.error(f"Error deleting questions: {repr(e)}")
        return jsonify({'error': str(e)}), 500
    finally:
        # Always release the connection, including on the error paths.
        if conn is not None:
            conn.close()

from rich.table import Table
from rich.console import Console

def _normalize_classifier_label(value):
    """Coerce a classifier-returned subject/chapter value to a plain string.

    Lists are joined with ``', '`` (an empty list becomes ``'Unclassified'``),
    ``None`` becomes ``'Unclassified'`` and anything else is passed to ``str``.
    """
    if isinstance(value, list):
        return ', '.join(str(x) for x in value) if value else 'Unclassified'
    if value is None:
        return 'Unclassified'
    return str(value)


@classifier_bp.route('/extract_and_classify_all/<session_id>', methods=['POST'])
@login_required
def extract_and_classify_all(session_id):
    """OCR every cropped image of a session and classify the extracted text.

    Steps:
      1. Verify the session belongs to the current user.
      2. OCR each cropped image that has a processed file on disk and store
         the recognised text in ``questions.question_text``.
      3. Send the texts to the user's preferred classifier (``nova``,
         ``gemma`` or, by default, Gemini) in batches of 7, writing the
         returned subject/chapter to the database after every batch and
         pausing 5 seconds between batches.
      4. Re-sync the session's neetprep collection.

    Args:
        session_id: Session identifier taken from the URL.

    Returns:
        ``{'success': True, 'message': ...}`` on success, otherwise
        ``{'error': ...}`` with status 403 (not the owner), 404 (no cropped
        images) or 500 (any exception).
    """
    conn = None
    try:
        conn = get_db_connection()
        # Security: Check ownership of the session
        session_owner = conn.execute('SELECT user_id FROM sessions WHERE id = ?', (session_id,)).fetchone()
        if not session_owner or session_owner['user_id'] != current_user.id:
            return jsonify({'error': 'Unauthorized'}), 403

        images = conn.execute(
            "SELECT id, processed_filename FROM images WHERE session_id = ? AND image_type = 'cropped' ORDER BY id",
            (session_id,)
        ).fetchall()

        if not images:
            return jsonify({'error': 'No cropped images found in session'}), 404

        current_app.logger.info(f"Found {len(images)} images to process for user {current_user.id}.")

        # question_texts[k] and image_ids[k] always describe the same image.
        question_texts = []
        image_ids = []
        for image in images:
            image_id = image['id']
            processed_filename = image['processed_filename']

            if not processed_filename:
                continue

            image_path = os.path.join(current_app.config['PROCESSED_FOLDER'], processed_filename)
            if not os.path.exists(image_path):
                continue

            image_bytes = resize_image_if_needed(image_path)
            ocr_result = call_nim_ocr_api(image_bytes)

            current_app.logger.info(f"NVIDIA OCR Result for image {image_id}: {ocr_result}")

            if not ocr_result.get('data') or not ocr_result['data'][0].get('text_detections'):
                current_app.logger.error(f"NVIDIA OCR result for image {image_id} does not contain 'text_detections' key. Full response: {ocr_result}")
                continue

            text = " ".join(item['text_prediction']['text'] for item in ocr_result['data'][0]['text_detections'])

            conn.execute('UPDATE questions SET question_text = ? WHERE image_id = ?', (text, image_id))
            current_app.logger.info(f"Updated question_text for image_id: {image_id}")
            question_texts.append(text)
            image_ids.append(image_id)

        conn.commit()

        # --- Batch Processing and Classification ---
        batch_size = 7  # Default batch size
        total_questions = len(question_texts)
        num_batches = (total_questions + batch_size - 1) // batch_size
        total_update_count = 0

        # Choose classifier based on user preference (same for every batch)
        classifier_model = getattr(current_user, 'classifier_model', 'gemini')

        for i in range(num_batches):
            start_index = i * batch_size
            end_index = start_index + batch_size

            batch_texts = question_texts[start_index:end_index]

            if not batch_texts:
                continue

            current_app.logger.info(f"Processing Batch {i+1}/{num_batches}...")

            if classifier_model == 'nova':
                current_app.logger.info(f"Using Nova classifier for user {current_user.id}")
                classification_result = classify_questions_with_nova(batch_texts, start_index=start_index)
                model_name = "Nova"
            elif classifier_model == 'gemma':
                current_app.logger.info(f"Using Gemma classifier for user {current_user.id}")
                classification_result = gemma_classifier.classify(batch_texts, start_index=start_index)
                model_name = "Gemma"
            else:
                current_app.logger.info(f"Using Gemini classifier for user {current_user.id}")
                classification_result = classify_questions_with_gemini(batch_texts, start_index=start_index)
                model_name = "Gemini"

            # Log the result to the terminal
            current_app.logger.info(f"--- Classification Result ({model_name}) for Batch {i+1} ---")
            current_app.logger.info(json.dumps(classification_result, indent=2))
            current_app.logger.info("---------------------------------------------")

            if not classification_result or not classification_result.get('data'):
                current_app.logger.error(f'{model_name} classifier did not return valid data for batch {i+1}.')
                continue  # Move to the next batch

            # --- Immediate DB Update for the Batch ---
            batch_update_count = 0
            for item in classification_result.get('data', []):
                if not isinstance(item, dict):
                    continue

                item_index_global = item.get('index')  # This is the global index (e.g., 1 to 14)
                if item_index_global is None:
                    continue

                # The index is 1-based, our list is 0-based.
                try:
                    local_list_index = int(item_index_global) - 1
                except (TypeError, ValueError):
                    current_app.logger.error(f"Classifier returned a non-integer index: {item_index_global!r}")
                    continue

                # Explicit range check: an index of 0 or below would become a
                # negative list index, which Python accepts and which would
                # silently update the wrong image.
                if not 0 <= local_list_index < len(image_ids):
                    current_app.logger.error(f"Classifier returned an out-of-bounds index: {item_index_global}")
                    continue

                matched_id = image_ids[local_list_index]

                # Ensure subject and chapter are strings, not lists
                new_subject = _normalize_classifier_label(item.get('subject'))
                new_chapter = _normalize_classifier_label(item.get('chapter_title'))

                # Only store a result when a real subject was returned; an
                # empty chapter is recorded as 'Unclassified'.
                if new_subject and new_subject != 'Unclassified':
                    conn.execute(
                        'UPDATE questions SET subject = ?, chapter = ? WHERE image_id = ?',
                        (new_subject, new_chapter or 'Unclassified', matched_id),
                    )
                    batch_update_count += 1

            conn.commit()
            total_update_count += batch_update_count
            current_app.logger.info(f"Batch {i+1} processed. Updated {batch_update_count} questions in the database.")

            if i < num_batches - 1:
                current_app.logger.info("Waiting 5 seconds before next batch...")
                time.sleep(5)

        # Auto-convert and sync neetprep collection
        sync_neetprep_collection(conn, session_id, current_user.id)

        conn.commit()

        return jsonify({'success': True, 'message': f'Successfully extracted and classified {total_questions} questions. Updated {total_update_count} entries in the database.'})

    except Exception as e:
        current_app.logger.error(f'Failed to extract and classify questions: {str(e)}', exc_info=True)
        return jsonify({'error': f'Failed to extract and classify questions: {str(e)}'}), 500
    finally:
        # Always release the connection, including on the error paths.
        if conn is not None:
            conn.close()