Spaces:

pykara
/

py-learn-backend

Runtime error

App Files Files Community

Oviya committed on Jan 23

Commit

59f2028

1 Parent(s): c3ad823

fix

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +20 -5
apt.txt +0 -6
chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04/header.bin → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195/data_level0.bin +2 -2
{chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54 → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195}/header.bin +1 -1
{chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04 → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195}/length.bin +2 -2
{chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04 → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195}/link_lists.bin +0 -0
feedback.mp4 → assets/feedback.mp4 +0 -0
{pdfs → assets/pdfs}/high/high.pdf +0 -0
{pdfs → assets/pdfs}/low/low.pdf +0 -0
{pdfs → assets/pdfs}/mid/mid.pdf +0 -0
chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54/data_level0.bin → assets/teacher.png +2 -2
teacher_feedback_sentences_category.json → assets/teacher_feedback_sentences_category.json +0 -0
static/references/voice1.wav → assets/teachervoice.wav +0 -0
auth/__init__.py +25 -0
auth/database.py +168 -0
auth/models.py +177 -0
auth/routes.py +346 -0
auth/utils.py +156 -0
build_chroma_db.py +91 -0
chat.py +0 -246
chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54/length.bin +0 -3
chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54/link_lists.bin +0 -0
findingword.py +0 -276
generateQuestion.py +0 -535
googlecredentails.json +0 -13
listen.py +0 -436
media/audio/explain_1112505a6701429cb241d131a88bf709.wav +0 -3
media/audio/explain_5c2a7427d1f14a2aa9fa9e59bb1ad603.wav +0 -3
media/audio/explain_975ae1b5996743f6b76b5016f17056de.wav +0 -3
media/audio/explain_ca92720c882d4926973973aa4b9f2316.wav +0 -3
media/audio/explain_cc24a21b0b374e50bc8afbf73a7398c4.wav +0 -3
media/audio/explain_dd70fb52325d44fc84cde7c1c9215232.wav +0 -3
media/audio/synth_22ebf1e3b9404b34a41b2fdc2c691adb.wav +0 -3
media/audio/synth_2757240115da4ba3a9aa1286aee57db9.wav +0 -3
media/audio/synth_4965badeb7da43ffac0c3a7af781ab0f.wav +0 -3
media/audio/synth_7bccf943f0b24880b77aa038b38f8bf1.wav +0 -3
chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04/data_level0.bin → media/audio/synth_d38b265fcd6d4f9cbb825007c3f52ac5.wav +2 -2
media/audio/synth_ee1e3e992d6641b9a06d214e0e67ea92.wav +0 -3
pdfs/testing.pdf +0 -3
pron.py +0 -729
pronragg.py +0 -263
pronragupgrade.py → pronunciation.py +180 -371
pronvideo.py +0 -359
ragg/app.py +295 -491
ragg/ingest_all.py +2 -2
ragg/tts.py +1 -1
reading.py +0 -158
start.sh +0 -29
trim/voice1.wav +0 -3
verification.py +164 -504

.env CHANGED Viewed

@@ -4,19 +4,34 @@ DB_DATABASE=AuthenticationDB1
 DB_DRIVER=ODBC Driver 17 for SQL Server   # match the driver installed on your PC
 RUN_INIT_DB=0
 COHERE_API_KEY=iXPfvur9lmAS4Mo91Bdfc6Gujhi3Jdnm6FP2JJqR
-OPENAI_API_KEY=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
-DID_API_KEY=cmFqYWxhc2htaS5uQHB5a2FyYS5uZXQ:J2uPGx3uD4L7UKgHEiMJI
 DID_SOURCE_IMAGE_URL=https://i.ibb.co/Tpq77ZJ/teacher.png
 DID_VOICE_ID=en-US-JennyNeural
 TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
-CHROMA_DIR=C:/Users/DELL/Desktop/Deploymnet/29 oct/py-learn-backend/ragg/chroma
-CHROMA_ROOT=C:/Users/DELL/Desktop/Deploymnet/29 oct/py-learn-backend/ragg/chroma
 EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 ALLOWED_ORIGINS=http://localhost:4200,http://127.0.0.1:4200
 RAG_INGEST_URL=http://localhost:5000/rag/ingest
 AWS_ACCESS_KEY_ID=AKIA3PWGNRHL7RTV3XRJ
 AWS_SECRET_ACCESS_KEY=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
 AWS_REGION=ap-south-1
 S3_BUCKET=pykara-tts-audio
-S3_PREFIX=audio/

 DB_DRIVER=ODBC Driver 17 for SQL Server   # match the driver installed on your PC
 RUN_INIT_DB=0
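+# --- API Keys (never commit real values; inject them via the deployment's secret store and rotate any key that has been pushed)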
 COHERE_API_KEY=iXPfvur9lmAS4Mo91Bdfc6Gujhi3Jdnm6FP2JJqR
+OPENAI_API_KEY=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
+# --- D-ID Configuration
+DID_API_KEY=cmFqYWxhc2htaS5uQHB5a2FyYS5uZXQ:9Moos-oxSY8uNUNGx1o-u
 DID_SOURCE_IMAGE_URL=https://i.ibb.co/Tpq77ZJ/teacher.png
 DID_VOICE_ID=en-US-JennyNeural
+# --- Tesseract OCR
 TESSERACT_CMD=C:\Program Files\Tesseract-OCR\tesseract.exe
+# --- ChromaDB Configuration
+CHROMA_DIR=C:/Viji-Workingfolder/17-1-26/mj-learn-backend/ragg/chroma
+CHROMA_ROOT=C:/Viji-Workingfolder/17-1-26/mj-learn-backend/ragg/chroma
 EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+# --- CORS and RAG
 ALLOWED_ORIGINS=http://localhost:4200,http://127.0.0.1:4200
 RAG_INGEST_URL=http://localhost:5000/rag/ingest
+# --- AWS S3 Configuration
 AWS_ACCESS_KEY_ID=AKIA3PWGNRHL7RTV3XRJ
 AWS_SECRET_ACCESS_KEY=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
 AWS_REGION=ap-south-1
 S3_BUCKET=pykara-tts-audio
+S3_PREFIX=audio/
+# --- Authentication Secret Key (CRITICAL for JWT tokens; keep it out of version control and rotate it if it has ever been committed)
+SECRET_KEY=96c63da06374c1bde332516f3acbd23c84f35f90d8a6321a25d790a0a451af32

apt.txt DELETED Viewed

@@ -1,6 +0,0 @@
-ffmpeg
-poppler-utils
-tesseract-ocr
-tesseract-ocr-eng
-libsndfile1
-espeak-ng

chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04/header.bin → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195/data_level0.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
-size 100

 version https://git-lfs.github.com/spec/v1
+oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+size 1676000

{chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54 → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195}/header.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
 size 100

 version https://git-lfs.github.com/spec/v1
+oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
 size 100

{chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04 → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195}/length.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7171cf84eb030fe5cb580f57a325f57cceb0aed0e55ea95c81d67d4181e1ed81
-size 400

 version https://git-lfs.github.com/spec/v1
+oid sha256:a9250db95ea158634771707fb36f3fe0d92d810baeb15d1a9b51716f832628c2
+size 4000

{chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04 → assets/chroma_db/09c5ed20-106f-41c5-94dc-89d203beb195}/link_lists.bin RENAMED Viewed

File without changes

feedback.mp4 → assets/feedback.mp4 RENAMED Viewed

File without changes

{pdfs → assets/pdfs}/high/high.pdf RENAMED Viewed

File without changes

{pdfs → assets/pdfs}/low/low.pdf RENAMED Viewed

File without changes

{pdfs → assets/pdfs}/mid/mid.pdf RENAMED Viewed

File without changes

chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54/data_level0.bin → assets/teacher.png RENAMED Viewed

File without changes

teacher_feedback_sentences_category.json → assets/teacher_feedback_sentences_category.json RENAMED Viewed

File without changes

static/references/voice1.wav → assets/teachervoice.wav RENAMED Viewed

File without changes

auth/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+"""
+Authentication module for MJ Learn Backend
+This module provides:
+- User authentication and authorization
+- JWT token management
+- Database models for user management
+- Security utilities
+"""
+from .models import User, BlacklistedToken, RefreshToken
+from .utils import token_required, anonymize_username
+from .database import get_db_connection, init_db
+from .routes import auth_bp
+# Names re-exported as the public surface of the auth package.
+__all__ = [
+    "User",
+    "BlacklistedToken",
+    "RefreshToken",
+    "token_required",
+    "anonymize_username",
+    "get_db_connection",
+    "init_db",
+    "auth_bp",
+]

auth/database.py ADDED Viewed

	@@ -0,0 +1,168 @@

+"""
+Database connection and initialization module
+Handles:
+- Database connection management
+- Table creation and initialization
+- Connection string configuration
+- Database diagnostics
+"""
+import os
+import pyodbc
+from threading import Lock
+from .models import get_table_definitions
+# Database configuration, read from the environment once at import time.
+DB_SERVER = os.getenv("DB_SERVER", r"(localdb)\MSSQLLocalDB")
+DB_DATABASE = os.getenv("DB_DATABASE", "AuthenticationDB1")
+DB_DRIVER = os.getenv("DB_DRIVER", "ODBC Driver 17 for SQL Server")
+# A server is treated as local when its name starts with "localhost", "." or
+# "(localdb)", or when it contains a backslash (HOST\INSTANCE form).
+_server_lower = DB_SERVER.lower()
+is_local = (
+    _server_lower.startswith(("localhost", ".", "(localdb)"))
+    or "\\" in DB_SERVER
+)
+# Leading part shared by both connection-string flavours.
+_conn_target = f"DRIVER={{{DB_DRIVER}}};SERVER={DB_SERVER};DATABASE={DB_DATABASE};"
+if is_local:
+    # Local server: Trusted_Connection, no UID/PWD.
+    CONN_STR = _conn_target + "Trusted_Connection=yes;TrustServerCertificate=yes;"
+else:
+    # Remote server: SQL authentication with DB_USER / DB_PASSWORD.
+    _db_user = os.getenv('DB_USER')
+    _db_password = os.getenv('DB_PASSWORD')
+    CONN_STR = (
+        _conn_target
+        + f"UID={_db_user};PWD={_db_password};"
+        + "Encrypt=yes;TrustServerCertificate=yes;"
+    )
+# One-time initialization guard used by ensure_database_initialized().
+_db_init_done = False
+_db_init_lock = Lock()
+def get_db_connection():
+    """
+    Open a new pyodbc connection from the module-level CONN_STR (timeout=5).
+    Returns:
+        pyodbc.Connection: a fresh connection; the caller must close it.
+    Raises:
+        RuntimeError: if CONN_STR is not a Trusted_Connection string and
+            DB_USER or DB_PASSWORD is missing from the environment.
+        pyodbc.Error: if the connection attempt fails.
+    """
+    uses_sql_auth = "Trusted_Connection=yes" not in CONN_STR
+    if uses_sql_auth and not (os.getenv("DB_USER") and os.getenv("DB_PASSWORD")):
+        raise RuntimeError("DB_USER/DB_PASSWORD are not set in the environment.")
+    return pyodbc.connect(CONN_STR, timeout=5)
+def init_db():
+    """
+    Create the authentication tables if they do not exist.
+    Executes every statement returned by get_table_definitions() in the
+    order the dict yields them, then commits once. The connection is closed
+    even when a statement fails.
+    Raises:
+        RuntimeError: propagated from get_db_connection().
+        pyodbc.Error: if connecting or executing a statement fails.
+    """
+    conn = get_db_connection()
+    try:
+        cur = conn.cursor()
+        # Only the SQL is needed; the dict keys are labels.
+        for sql in get_table_definitions().values():
+            cur.execute(sql)
+        conn.commit()
+    finally:
+        # Release the connection on failure too.
+        conn.close()
+def ensure_database_initialized():
+    """
+    Run init_db() at most once per process, guarded by a lock.
+    Initialization is attempted only when the RUN_INIT_DB environment
+    variable equals "1". The "done" flag is set only after init_db()
+    succeeds, so a failed attempt is not reported as initialized and a later
+    call can retry.
+    Returns:
+        bool: True once initialization has succeeded in this process,
+        otherwise False (including when RUN_INIT_DB is not "1").
+    Raises:
+        Exception: whatever init_db() raised; the failure is printed first.
+    """
+    global _db_init_done
+    if os.getenv("RUN_INIT_DB", "0") != "1" or _db_init_done:
+        return _db_init_done
+    with _db_init_lock:
+        # Re-check under the lock: another thread may have finished meanwhile.
+        if not _db_init_done:
+            try:
+                init_db()
+            except Exception as e:
+                print(f"? Database initialization failed: {e}")
+                raise
+            _db_init_done = True
+            print("? Database initialized successfully")
+    return _db_init_done
+def get_database_info():
+    """
+    Collect database diagnostics that contain no credentials.
+    Returns:
+        dict: "drivers_found" (or "drivers_found_error"), "database_name",
+        "server_type" ("LocalDB" or "Remote"), "connection_status"
+        ("ok" or "error") and, on a failed connection, "error_type" holding
+        only the exception class name.
+    """
+    report = {}
+    # Installed ODBC drivers, or the reason they could not be listed.
+    try:
+        drivers = pyodbc.drivers()
+    except Exception as exc:
+        report["drivers_found_error"] = str(exc)
+    else:
+        report["drivers_found"] = drivers
+    report["database_name"] = DB_DATABASE
+    report["server_type"] = "LocalDB" if is_local else "Remote"
+    # Connectivity probe: open a connection and close it straight away.
+    try:
+        get_db_connection().close()
+    except Exception as exc:
+        report["connection_status"] = "error"
+        report["error_type"] = type(exc).__name__
+    else:
+        report["connection_status"] = "ok"
+    return report
+def test_database_connection():
+    """
+    Check that the database answers a trivial query.
+    Opens a connection, runs "SELECT 1" and closes the connection again,
+    also when the query raises.
+    Returns:
+        tuple: (success: bool, message: str). Never raises; any exception is
+        reported in the message.
+    """
+    try:
+        conn = get_db_connection()
+        try:
+            cur = conn.cursor()
+            cur.execute("SELECT 1")
+            result = cur.fetchone()
+        finally:
+            # Close on failure too, so a failing probe does not leak.
+            conn.close()
+        if result and result[0] == 1:
+            return True, "Database connection successful"
+        return False, "Database query failed"
+    except Exception as e:
+        return False, f"Database connection failed: {str(e)}"

auth/models.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+Database models and schemas for authentication system
+Contains:
+- User model with role-based access
+- Token blacklist model
+- Refresh token model
+- Database table definitions
+"""
+import pyodbc
+from typing import Optional, Dict, Any
+class User:
+    """A row of the Users table plus static helpers that query that table."""
+    def __init__(self, username: str, password_hash: str, role: str = 'user', user_id: int = None):
+        self.id = user_id
+        self.username = username
+        self.password_hash = password_hash
+        self.role = role
+    @staticmethod
+    def find_by_username(conn: pyodbc.Connection, username: str) -> Optional['User']:
+        """Return the User whose username matches exactly, or None."""
+        cursor = conn.cursor()
+        cursor.execute("SELECT id, username, password_hash, role FROM Users WHERE username = ?", (username,))
+        record = cursor.fetchone()
+        if not record:
+            return None
+        # Column order follows the SELECT list above.
+        return User(record[1], record[2], record[3], record[0])
+    @staticmethod
+    def create_user(conn: pyodbc.Connection, username: str, password_hash: str, role: str = 'user') -> bool:
+        """Insert a user and commit; return False on pyodbc.IntegrityError."""
+        insert_sql = "INSERT INTO Users (username, password_hash, role) VALUES (?, ?, ?)"
+        try:
+            conn.cursor().execute(insert_sql, (username, password_hash, role))
+            conn.commit()
+        except pyodbc.IntegrityError:
+            return False
+        return True
+    @staticmethod
+    def get_all_users(conn: pyodbc.Connection) -> list:
+        """Return every user as an id/username/role dict, ordered by id."""
+        cursor = conn.cursor()
+        cursor.execute("SELECT id, username, role FROM Users ORDER BY id")
+        return [
+            {"id": record[0], "username": record[1], "role": record[2]}
+            for record in cursor.fetchall()
+        ]
+    @staticmethod
+    def promote_to_admin(conn: pyodbc.Connection, username: str) -> bool:
+        """Set the user's role to 'admin'; True if at least one row changed."""
+        cursor = conn.cursor()
+        cursor.execute("UPDATE Users SET role = 'admin' WHERE username = ?", (username,))
+        conn.commit()
+        rows_changed = cursor.rowcount
+        return rows_changed > 0
+    @staticmethod
+    def user_count(conn: pyodbc.Connection) -> int:
+        """Return the number of rows in Users."""
+        cursor = conn.cursor()
+        cursor.execute("SELECT COUNT(*) FROM Users")
+        (total,) = cursor.fetchone()
+        return total
+    def to_dict(self) -> Dict[str, Any]:
+        """Return id, username and role; password_hash is deliberately left out."""
+        public_fields = ("id", "username", "role")
+        return {field: getattr(self, field) for field in public_fields}
+class BlacklistedToken:
+    """Static helpers around the BlacklistedTokens table (revoked JWTs)."""
+    @staticmethod
+    def is_blacklisted(conn: pyodbc.Connection, token: str) -> bool:
+        """Return True when the exact token string is stored in the blacklist."""
+        cur = conn.cursor()
+        cur.execute("SELECT token FROM BlacklistedTokens WHERE token = ?", (token,))
+        return cur.fetchone() is not None
+    @staticmethod
+    def add_to_blacklist(conn: pyodbc.Connection, token: str) -> bool:
+        """
+        Store the token in the blacklist; a no-op if it is already there.
+        Returns:
+            bool: always True once the token is known to be blacklisted.
+        Raises:
+            pyodbc.Error: for database failures other than a duplicate insert.
+        """
+        if BlacklistedToken.is_blacklisted(conn, token):
+            return True  # Already blacklisted
+        try:
+            conn.cursor().execute("INSERT INTO BlacklistedTokens (token) VALUES (?)", (token,))
+            conn.commit()
+        except pyodbc.IntegrityError:
+            # A concurrent request inserted the same token between the check
+            # and the insert; the UNIQUE constraint rejected this one, which
+            # still leaves the token blacklisted.
+            conn.rollback()
+        return True
+class RefreshToken:
+    """Static helpers around the RefreshTokens table."""
+    @staticmethod
+    def find_by_token(conn: pyodbc.Connection, token: str) -> Optional[str]:
+        """Return the username that owns the refresh token, or None."""
+        cursor = conn.cursor()
+        cursor.execute("SELECT username FROM RefreshTokens WHERE token = ?", (token,))
+        record = cursor.fetchone()
+        if not record:
+            return None
+        return record[0]
+    @staticmethod
+    def create_token(conn: pyodbc.Connection, username: str, token: str) -> bool:
+        """Insert a (username, token) row and commit; always returns True."""
+        insert_sql = "INSERT INTO RefreshTokens (username, token) VALUES (?, ?)"
+        conn.cursor().execute(insert_sql, (username, token))
+        conn.commit()
+        return True
+    @staticmethod
+    def delete_user_tokens(conn: pyodbc.Connection, username: str) -> bool:
+        """Delete every refresh token of the user and commit; always returns True."""
+        delete_sql = "DELETE FROM RefreshTokens WHERE username = ?"
+        conn.cursor().execute(delete_sql, (username,))
+        conn.commit()
+        return True
+# DDL for the authentication tables
+def get_table_definitions():
+    """
+    Return the CREATE TABLE statements for the authentication schema.
+    Returns:
+        dict: label -> SQL text, in the order users, blacklisted_tokens,
+        refresh_tokens. Each statement is guarded by an OBJECT_ID check so it
+        only creates a table that does not exist yet.
+    """
+    users_ddl = """
+            IF OBJECT_ID('Users', 'U') IS NULL
+            CREATE TABLE Users (
+                id INT IDENTITY(1,1) PRIMARY KEY,
+                username NVARCHAR(100) UNIQUE NOT NULL,
+                password_hash NVARCHAR(500) NOT NULL,
+                role NVARCHAR(50) DEFAULT 'user'
+            )
+        """
+    blacklist_ddl = """
+            IF OBJECT_ID('BlacklistedTokens', 'U') IS NULL
+            CREATE TABLE BlacklistedTokens (
+                id INT IDENTITY(1,1) PRIMARY KEY,
+                token NVARCHAR(1000) UNIQUE NOT NULL,
+                created_at DATETIME DEFAULT GETDATE()
+            )
+        """
+    # References Users(username), so it is listed after the Users table.
+    refresh_ddl = """
+            IF OBJECT_ID('RefreshTokens', 'U') IS NULL
+            CREATE TABLE RefreshTokens (
+                id INT IDENTITY(1,1) PRIMARY KEY,
+                username NVARCHAR(100) NOT NULL,
+                token NVARCHAR(1000) UNIQUE NOT NULL,
+                created_at DATETIME DEFAULT GETDATE(),
+                FOREIGN KEY (username) REFERENCES Users(username) ON DELETE CASCADE
+            )
+        """
+    definitions = {}
+    definitions['users'] = users_ddl
+    definitions['blacklisted_tokens'] = blacklist_ddl
+    definitions['refresh_tokens'] = refresh_ddl
+    return definitions

auth/routes.py ADDED Viewed

	@@ -0,0 +1,346 @@

+"""
+Authentication routes and endpoints
+Contains all authentication-related Flask routes:
+- User registration and login
+- Token refresh and logout
+- Admin user management
+- Database diagnostics
+"""
+import os
+import datetime
+import bcrypt
+import jwt
+import pyodbc
+from flask import Blueprint, request, jsonify, make_response, current_app
+from .database import get_db_connection
+from .models import User, BlacklistedToken, RefreshToken
+from .utils import (
+    token_required,
+    anonymize_username,
+    add_cookie,
+    validate_user_input,
+    is_admin_user,
+    log_security_event
+)
+# Blueprint that carries every authentication endpoint in this module
+auth_bp = Blueprint('auth', __name__)
+@auth_bp.route("/dashboard")
+@token_required
+def dashboard(username):
+    """Return a welcome message for the authenticated user."""
+    greeting = f"Welcome {username} to your dashboard!"
+    return jsonify({"message": greeting})
+@auth_bp.route("/login", methods=["POST"])
+def login():
+    """
+    User login endpoint.
+    Validates the JSON credentials, checks the password with bcrypt, issues a
+    15-minute access token and a 7-day refresh token (HS256, signed with
+    SECRET_KEY), stores the refresh token and sets both as cookies.
+    Responses: 400 invalid input, 401 bad credentials, 503 database error,
+    200 on success. Every database connection is closed even when a query
+    raises.
+    """
+    data = request.json or {}
+    username = data.get('username', '').strip()
+    password = data.get('password', '')
+    # Input validation
+    is_valid, error_msg = validate_user_input(username, password)
+    if not is_valid:
+        return jsonify({"message": error_msg}), 400
+    # Usernames are stored lower-cased (signup does the same)
+    username = username.lower()
+    try:
+        conn = get_db_connection()
+        try:
+            user = User.find_by_username(conn, username)
+        finally:
+            conn.close()
+    except Exception as e:
+        current_app.logger.exception("DB access error on login: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+    # Same response for unknown user and wrong password; only the log differs
+    if not user:
+        log_security_event("failed_login", username, request.remote_addr, "user_not_found")
+        return jsonify({"message": "Invalid credentials"}), 401
+    if not bcrypt.checkpw(password.encode('utf-8'), user.password_hash.encode('utf-8')):
+        log_security_event("failed_login", username, request.remote_addr, "wrong_password")
+        return jsonify({"message": "Invalid credentials"}), 401
+    # Successful login
+    log_security_event("successful_login", username, request.remote_addr)
+    # Generate tokens
+    now = datetime.datetime.now(datetime.timezone.utc)
+    secret = current_app.config['SECRET_KEY']
+    access_token = jwt.encode(
+        {'username': username, 'exp': now + datetime.timedelta(minutes=15)},
+        secret,
+        algorithm="HS256"
+    )
+    refresh_token = jwt.encode(
+        {'username': username, 'exp': now + datetime.timedelta(days=7)},
+        secret,
+        algorithm="HS256"
+    )
+    # Store refresh token
+    try:
+        conn = get_db_connection()
+        try:
+            RefreshToken.create_token(conn, username, refresh_token)
+        finally:
+            conn.close()
+    except Exception as e:
+        current_app.logger.exception("DB write error on login: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+    resp = make_response(jsonify({"message": "Login successful"}))
+    add_cookie(resp, 'access_token', access_token, 900)                 # 15 min
+    add_cookie(resp, 'refresh_token', refresh_token, 7*24*60*60)       # 7 days
+    return resp
+@auth_bp.route("/refresh", methods=["POST"])
+def refresh():
+    """
+    Token refresh endpoint.
+    Reads the refresh_token cookie, verifies its signature and expiry, looks
+    up the owning username in RefreshTokens and issues a new 15-minute
+    access token (returned in the JSON body and as a cookie).
+    Responses: 400 cookie missing, 401 expired/invalid/unknown token,
+    503 database error, 200 on success.
+    """
+    refresh_token = request.cookies.get("refresh_token")
+    if not refresh_token:
+        return jsonify({'message': 'Refresh token is missing'}), 400
+    try:
+        # Decoded only to validate signature and expiry; the username used
+        # below comes from the stored token row, not from the payload.
+        jwt.decode(refresh_token, current_app.config['SECRET_KEY'], algorithms=["HS256"])
+    except jwt.ExpiredSignatureError:
+        return jsonify({'message': 'Refresh token has expired'}), 401
+    except jwt.InvalidTokenError:
+        return jsonify({'message': 'Invalid refresh token'}), 401
+    try:
+        conn = get_db_connection()
+        try:
+            username = RefreshToken.find_by_token(conn, refresh_token)
+        finally:
+            # Close even when the lookup raises
+            conn.close()
+    except Exception as e:
+        current_app.logger.exception("DB access error on refresh: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+    if not username:
+        return jsonify({'message': 'Invalid refresh token'}), 401
+    # Generate new access token
+    new_access = jwt.encode(
+        {'username': username, 'exp': datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(minutes=15)},
+        current_app.config['SECRET_KEY'],
+        algorithm="HS256"
+    )
+    resp = make_response(jsonify({'access_token': new_access}))
+    add_cookie(resp, 'access_token', new_access, 900)
+    return resp
+@auth_bp.route("/logout", methods=["POST"])
+@token_required
+def logout(username):
+    """
+    User logout endpoint.
+    Blacklists the current access token, deletes all refresh tokens of the
+    user and clears both cookies.
+    Responses: 401 access_token cookie missing, 503 database error,
+    200 on success.
+    """
+    token = request.cookies.get('access_token')
+    if not token:
+        return jsonify({"message": "Invalid token format"}), 401
+    try:
+        conn = get_db_connection()
+        try:
+            # Add to blacklist
+            BlacklistedToken.add_to_blacklist(conn, token)
+            # Delete refresh tokens
+            RefreshToken.delete_user_tokens(conn, username)
+        finally:
+            # Close even when one of the writes raises
+            conn.close()
+        log_security_event("logout", username, request.remote_addr)
+    except Exception as e:
+        current_app.logger.exception("DB write error on logout: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+    resp = make_response(jsonify({"message": "Logged out successfully!"}))
+    resp.delete_cookie('access_token', path='/')
+    resp.delete_cookie('refresh_token', path='/')
+    return resp
+@auth_bp.route("/check-auth", methods=["GET"])
+@token_required
+def check_auth(username):
+    """Confirm that the access token is valid and echo the username."""
+    body = {"message": "Authenticated", "username": username}
+    return jsonify(body), 200
+@auth_bp.route("/signup", methods=["POST"])
+def signup():
+    """
+    User registration endpoint.
+    Validates the JSON credentials, lower-cases the username, stores a bcrypt
+    hash of the password and creates the user with the default role.
+    Responses: 400 invalid input, 409 username taken, 503 database error,
+    201 on success. The connection is closed on every path, including errors.
+    """
+    data = request.json or {}
+    username = data.get('username', '').strip()
+    password = data.get('password', '')
+    # Input validation
+    is_valid, error_msg = validate_user_input(username, password)
+    if not is_valid:
+        return jsonify({"message": error_msg}), 400
+    # Normalize username (prevent duplicates like "Admin" and "admin")
+    username = username.lower()
+    try:
+        conn = get_db_connection()
+        try:
+            # Check if username already exists
+            if User.find_by_username(conn, username):
+                return jsonify({"message": "Username already exists"}), 409
+            # Hash password
+            password_hash = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt())
+            # create_user returns False when the insert hits an integrity
+            # error (e.g. the name was taken in the meantime)
+            created = User.create_user(conn, username, password_hash.decode('utf-8'))
+        finally:
+            conn.close()
+        if not created:
+            return jsonify({"message": "Username already exists"}), 409
+        log_security_event("user_registered", username, request.remote_addr)
+        return jsonify({"message": "User registered successfully"}), 201
+    except Exception as e:
+        current_app.logger.exception("DB error on signup: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+@auth_bp.route("/admin/promote-user", methods=["POST"])
+@token_required
+def promote_user(username):
+    """
+    Promote a user to admin role - ADMIN ONLY.
+    The target username is read from the JSON body, stripped and lower-cased.
+    Responses: 403 caller is not admin, 400 username missing or target is
+    already admin, 404 target not found, 500 update changed no row,
+    503 database error, 200 on success. A single try/finally closes the
+    connection on every path.
+    """
+    try:
+        conn = get_db_connection()
+        try:
+            # Check if current user is admin
+            if not is_admin_user(conn, username):
+                log_security_event("unauthorized_access", username, request.remote_addr, "promote-user")
+                return jsonify({"message": "Unauthorized - Admin access required"}), 403
+            # Get target username from request
+            data = request.json or {}
+            target_user = data.get('username', '').strip().lower()
+            if not target_user:
+                return jsonify({"message": "Username is required"}), 400
+            # Check if target user exists
+            target_user_obj = User.find_by_username(conn, target_user)
+            if not target_user_obj:
+                return jsonify({"message": "User not found"}), 404
+            if target_user_obj.role == 'admin':
+                return jsonify({"message": "User is already an admin"}), 400
+            # Promote user to admin
+            promoted = User.promote_to_admin(conn, target_user)
+        finally:
+            conn.close()
+        if not promoted:
+            return jsonify({"message": "Failed to promote user"}), 500
+        log_security_event("user_promoted", username, request.remote_addr, f"promoted {target_user}")
+        return jsonify({"message": f"User {target_user} promoted to admin successfully"}), 200
+    except Exception as e:
+        current_app.logger.exception("DB error in promote-user: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+@auth_bp.route("/admin/users", methods=["GET"])
+@token_required
+def list_users(username):
+    """
+    List all users - ADMIN ONLY.
+    Responses: 403 caller is not admin, 503 database error, 200 with
+    {"users": [...], "total": n} on success. The connection is closed even
+    when a query raises.
+    """
+    try:
+        conn = get_db_connection()
+        try:
+            # Check if current user is admin
+            if not is_admin_user(conn, username):
+                log_security_event("unauthorized_access", username, request.remote_addr, "list-users")
+                return jsonify({"message": "Unauthorized - Admin access required"}), 403
+            # Get all users
+            users = User.get_all_users(conn)
+        finally:
+            conn.close()
+        log_security_event("admin_action", username, request.remote_addr, "viewed_user_list")
+        return jsonify({"users": users, "total": len(users)}), 200
+    except Exception as e:
+        current_app.logger.exception("DB error in list-users: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+@auth_bp.route("/admin/create-first-admin", methods=["POST"])
+def create_first_admin():
+    """
+    Create the first admin user - ONLY if no users exist.
+    The endpoint is unauthenticated by design (no user exists yet), so the
+    bootstrap password is generated randomly per call instead of being a
+    fixed, publicly known value. It is returned once in the response and is
+    not logged.
+    Responses: 409 users already exist, 500 insert rejected, 503 database
+    error, 201 with username/password/warning on success.
+    """
+    # Local import: the module's import block does not provide `secrets`.
+    import secrets
+    try:
+        conn = get_db_connection()
+        try:
+            # Check if any users exist
+            if User.user_count(conn) > 0:
+                return jsonify({"message": "Users already exist. Cannot create first admin."}), 409
+            # Create first admin user with a one-time random password
+            username = "admin"
+            password = secrets.token_urlsafe(16)  # Should be changed immediately
+            # Hash password
+            password_hash = bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt())
+            # create_user returns False on an integrity error, e.g. when a
+            # concurrent request created "admin" first
+            created = User.create_user(conn, username, password_hash.decode('utf-8'), 'admin')
+        finally:
+            conn.close()
+        if not created:
+            return jsonify({"message": "Failed to create admin user"}), 500
+        log_security_event("first_admin_created", "system", request.remote_addr)
+        return jsonify({
+            "message": "First admin user created successfully",
+            "username": username,
+            "password": password,
+            "warning": "CHANGE THE PASSWORD IMMEDIATELY!"
+        }), 201
+    except Exception as e:
+        current_app.logger.exception("DB error creating first admin: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+@auth_bp.route("/db/diag", methods=["GET"])
+@token_required
+def db_diag(username):
+    """
+    Database diagnostics - ADMIN ONLY.
+    Responses: 403 caller is not admin, 503 database error during the admin
+    check, 200 with the get_database_info() report on success. The
+    connection used for the admin check is closed even when the check raises.
+    """
+    try:
+        conn = get_db_connection()
+        try:
+            is_admin = is_admin_user(conn, username)
+        finally:
+            conn.close()
+        # Security: Only allow admin users to access diagnostic information
+        if not is_admin:
+            log_security_event("unauthorized_access", username, request.remote_addr, "db-diag")
+            return jsonify({"message": "Unauthorized - Admin access required"}), 403
+    except Exception as e:
+        current_app.logger.exception("DB access error in db_diag: %s", e)
+        return jsonify({"message": "Database is unavailable"}), 503
+    # Proceed with diagnostics for admin users only
+    from .database import get_database_info
+    info = get_database_info()
+    log_security_event("admin_action", username, request.remote_addr, "accessed_db_diagnostics")
+    return jsonify(info), 200

auth/utils.py ADDED Viewed

	@@ -0,0 +1,156 @@

+"""
+Authentication utilities and security functions
+Contains:
+- JWT token validation decorator
+- Security helpers
+- Username anonymization for logging
+- Cookie management utilities
+"""
+import os
+import jwt
+import hashlib
+from functools import wraps
+from flask import request, jsonify, current_app, make_response
+from .database import get_db_connection
+from .models import BlacklistedToken
+def anonymize_username(username):
+    """
+    Return a short, stable pseudonym for a username, for use in logs.
+    Falsy input yields "anonymous"; otherwise the result is the first 12 hex
+    characters of SHA-256 over "user_<username>_salt".
+    """
+    if username:
+        salted = "user_{}_salt".format(username)
+        digest = hashlib.sha256(salted.encode()).hexdigest()
+        return digest[:12]
+    return "anonymous"
+def token_required(f):
+    """
+    JWT token validation decorator.
+    Reads the access_token cookie, rejects blacklisted tokens, verifies the
+    token (HS256, SECRET_KEY) and calls the wrapped view with the token's
+    username as first positional argument.
+    Responses produced here: 401 for a missing, revoked, expired or invalid
+    token (including one without a username claim), 500 for any other error.
+    """
+    @wraps(f)
+    def decorated(*args, **kwargs):
+        token = request.cookies.get('access_token')
+        if not token:
+            return jsonify({"message": "Token is missing"}), 401
+        try:
+            # Check blacklist; close the connection even if the query raises
+            conn = get_db_connection()
+            try:
+                revoked = BlacklistedToken.is_blacklisted(conn, token)
+            finally:
+                conn.close()
+            if revoked:
+                return jsonify({"message": "Token has been revoked. Please log in again."}), 401
+            # Decode and validate token
+            data = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=["HS256"])
+            username = data.get('username')
+            if not username:
+                # Correctly signed but without the claim the views rely on:
+                # treat as invalid rather than failing with a 500.
+                return jsonify({"message": "Invalid token"}), 401
+            return f(username, *args, **kwargs)
+        except jwt.ExpiredSignatureError:
+            return jsonify({"message": "Token has expired"}), 401
+        except jwt.InvalidTokenError:
+            return jsonify({"message": "Invalid token"}), 401
+        except Exception as e:
+            current_app.logger.exception("Auth error: %s", e)
+            return jsonify({"message": "Server error"}), 500
+    return decorated
+def extract_username_from_request(req) -> str | None:
+    """
+    Extract username from various sources in request
+    Checks in order:
+    1. X-User header
+    2. Request body JSON
+    3. JWT cookie
+    """
+    # 1) Header
+    hdr = req.headers.get("X-User")
+    if hdr:
+        return hdr
+    # 2) Body
+    data = req.get_json(silent=True) or {}
+    if data.get("username"):
+        return data.get("username")
+    # 3) JWT cookie
+    token = req.cookies.get("access_token")
+    if token:
+        try:
+            payload = jwt.decode(token, current_app.config["SECRET_KEY"], algorithms=["HS256"])
+            return payload.get("username")
+        except jwt.ExpiredSignatureError:
+            return None
+        except jwt.InvalidTokenError:
+            return None
+    return None
+def add_cookie(resp, name: str, value: str, max_age: int):
+    """
+    Add secure cookie to response
+    In prod: Secure + SameSite=None + Partitioned (works with third-party cookie protections).
+    In dev: SameSite=Lax, not Secure.
+    """
+    IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
+    if IS_PROD:
+        resp.headers.add(
+            "Set-Cookie",
+            f"{name}={value}; Path=/; Max-Age={max_age}; Secure; HttpOnly; SameSite=None; Partitioned"
+        )
+    else:
+        resp.set_cookie(name, value, httponly=True, secure=False, samesite="Lax", max_age=max_age, path="/")
+def validate_user_input(username: str, password: str) -> tuple[bool, str]:
+    """
+    Validate user input for signup/login
+    Returns: (is_valid, error_message)
+    """
+    if not username or not password:
+        return False, "Username and password are required"
+    if len(username) < 3 or len(username) > 50:
+        return False, "Username must be 3-50 characters"
+    if len(password) < 8:
+        return False, "Password must be at least 8 characters"
+    # Additional validation can be added here
+    # - Special character requirements
+    # - Username format validation
+    # - Password complexity checks
+    return True, ""
+def is_admin_user(conn, username: str) -> bool:
+    """Check if user has admin role"""
+    from .models import User
+    user = User.find_by_username(conn, username)
+    return user is not None and user.role == 'admin'
+def log_security_event(event_type: str, username: str, ip_address: str, details: str = ""):
+    """
+    Log security events with anonymized usernames
+    Args:
+        event_type: Type of security event (login, logout, failed_login, etc.)
+        username: Username (will be anonymized)
+        ip_address: Request IP address
+        details: Additional details about the event
+    """
+    user_hash = anonymize_username(username)
+    current_app.logger.info(
+        f"Security Event [{event_type}]: user_hash={user_hash}, ip={ip_address}, details={details}"
+    )

build_chroma_db.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import os
+import json
+import chromadb
+# ==============================
+# CONFIG
+# ==============================
+# Directory containing this script; the asset paths below are resolved against it.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# JSON file that load_segments() reads as the source of feedback segments.
+JSON_FILE = os.path.join(BASE_DIR, "assets/teacher_feedback_sentences_category.json")
+# Directory handed to chromadb.PersistentClient as its storage path.
+CHROMA_DIR = os.path.join(BASE_DIR, "assets/chroma_db")
+# Name of the Chroma collection that build_chroma() clears and refills.
+COLLECTION_NAME = "feedback"
+def safe_float(x):
+    """Convert '000.000' or 124.944 to float."""
+    try:
+        return float(x)
+    except:
+        return 0.0
+def load_segments(json_path):
+    with open(json_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+def build_chroma():
+    segments = load_segments(JSON_FILE)
+    # Create Chroma client
+    client = chromadb.PersistentClient(path=CHROMA_DIR)
+    collection = client.get_or_create_collection(COLLECTION_NAME)
+    # OPTIONAL: clear existing db (recommended if you already inserted wrong)
+    existing = collection.get()
+    existing_ids = existing.get("ids", [])
+    if existing_ids:
+        collection.delete(ids=existing_ids)
+        print(f"✅ Deleted old entries: {len(existing_ids)}")
+    ids = []
+    documents = []
+    metadatas = []
+    for seg in segments:
+        seg_id = seg.get("id")
+        text = seg.get("text", "").strip()
+        category = seg.get("category", "").strip()
+        video_file = seg.get("video_file", "").strip()
+        start = safe_float(seg.get("start"))
+        end = safe_float(seg.get("end"))
+        # metadata for chroma
+        meta = {
+            "category": category,
+            "video_file": video_file,
+            "start": start,
+            "end": end,
+        }
+        # store phoneme only if exists
+        if "phoneme" in seg and seg["phoneme"]:
+            meta["phoneme"] = seg["phoneme"].strip()
+        ids.append(seg_id)
+        documents.append(text)
+        metadatas.append(meta)
+    # Insert into ChromaDB
+    collection.add(
+        ids=ids,
+        documents=documents,
+        metadatas=metadatas
+    )
+    print("\n✅ ChromaDB created successfully!")
+    print(f"Total inserted: {len(ids)}")
+    # quick stats
+    vowels = [m for m in metadatas if m["category"] == "vowel"]
+    vowel_specific = [m for m in vowels if m.get("phoneme")]
+    consonants = [m for m in metadatas if m["category"] == "consonant"]
+    consonant_specific = [m for m in consonants if m.get("phoneme")]
+    print(f"Vowel total: {len(vowels)} | vowel specific: {len(vowel_specific)}")
+    print(f"Consonant total: {len(consonants)} | consonant specific: {len(consonant_specific)}")
+if __name__ == "__main__":
+    build_chroma()

chat.py DELETED Viewed

@@ -1,246 +0,0 @@
-from flask import Flask, jsonify, send_file, abort, make_response, request, Blueprint, current_app
-from flask_cors import CORS
-import os
-print(f"GOOGLE_APPLICATION_CREDENTIALS: {os.getenv('GOOGLE_APPLICATION_CREDENTIALS')}")
-import io
-import uuid
-import requests
-import re
-import tempfile  # needed by validate-pronounce
-app = Flask(__name__)
-CORS(app)
-# 👇 Add the helper right here
-def _cohere_headers():
-    api_key = current_app.config.get("COHERE_API_KEY") or COHERE_API_KEY
-    return {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-@app.route('/')
-def home():
-    return "Welcome to the Flask app! The server is running."
-# API configuration for AI-based question generation
-COHERE_API_KEY = os.getenv("COHERE_API_KEY", "")
-# (1) UPDATED URL: v2 endpoint on api.cohere.com
-COHERE_API_URL = 'https://api.cohere.com/v2/chat'
-# Dictionary to store user conversations
-user_sessions = {}
-# Endpoint to explain grammar topics
-movie_bp = Blueprint("movie", __name__)
-def _extract_text_v2(resp_json: dict) -> str:
-    """
-    v2 /chat returns:
-    { "message": { "content": [ { "type": "text", "text": "..." } ] } }
-    """
-    msg = resp_json.get("message", {})
-    content = msg.get("content", [])
-    if isinstance(content, list) and content:
-        block = content[0]
-        if isinstance(block, dict):
-            return (block.get("text") or "").strip()
-    return ""
-def _cohere_generate(prompt: str, max_tokens: int = 1000, temperature: float = 0.7):
-    api_key = current_app.config.get("COHERE_API_KEY") or COHERE_API_KEY
-    if not api_key:
-        return None, ("COHERE_API_KEY not set on the server", 500)
-    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
-    # (2) UPDATED PAYLOAD: use messages instead of prompt
-    payload = {
-        "model": "command-r-08-2024",
-        "messages": [
-            {"role": "user", "content": prompt}
-        ],
-        "max_tokens": max_tokens,
-        "temperature": temperature
-    }
-    try:
-        r = requests.post(COHERE_API_URL, headers=headers, json=payload, timeout=30)
-        if r.status_code != 200:
-            return None, (f"Cohere API error: {r.text}", 502)
-        # (3) UPDATED PARSING: read message.content[0].text
-        text = _extract_text_v2(r.json())
-        return text, None
-    except Exception as e:
-        current_app.logger.exception("Cohere request failed: %s", e)
-        return None, ("Upstream request failed", 502)
-@movie_bp.post("/explain-grammar")
-def explain_grammar():
-    try:
-        data = request.get_json()
-        print("Received Data:", data)
-        topic = data.get('topic', '').strip()
-        session_id = data.get('session_id', str(uuid.uuid4()))  # Use provided session_id or create a new one
-        if not topic:
-            return jsonify({'error': 'Topic is required'}), 400
-        # Retrieve previous conversation history
-        conversation_history = user_sessions.get(session_id, [])
-        # Keep the last 10 messages to maintain better context (adjustable)
-        if len(conversation_history) > 10:
-            conversation_history = conversation_history[-10:]
-        # Generate a more **adaptive** prompt
-        context = "\n".join(conversation_history) if conversation_history else ""
-        prompt = f"""
-        You are a highly skilled grammar assistant. Your job is to maintain a **dynamic conversation** and respond intelligently based on user input, If the user asks something **unrelated to grammar**, respond with: "Please send a grammar-related question..
-        - Your answers must always **relate to the conversation history** and **extend naturally** based on what was previously asked.
-        - Your answers must be **concise, clear, and to the point**
-        - If the user asks for **examples**, explanations, or clarifications, **automatically infer** which topic they are referring to.
-        - If the user's question is **vague**, determine the most **logical continuation** based on prior questions.
-        - If the user asks something **unrelated to grammar**, respond with: "Please send a grammar-related question."
-        **Conversation so far:**
-        {context}
-        **User's new question:** {topic}
-        Please provide a **coherent and relevant answer** that continues the conversation naturally.
-        """
-        # Make the API call to Cohere
-        headers = {
-            'Authorization': f'Bearer {COHERE_API_KEY}',
-            'Content-Type': 'application/json'
-        }
-        # (2) UPDATED PAYLOAD: messages array
-        payload = {
-            'model': 'command-r-08-2024',
-            'messages': [
-                {'role': 'user', 'content': prompt}
-            ],
-            'max_tokens': 1000
-        }
-        response = requests.post(COHERE_API_URL, headers=headers, json=payload)
-        if response.status_code == 200:
-            # (3) UPDATED PARSING
-            ai_response = _extract_text_v2(response.json())
-            # Store conversation history to maintain context
-            conversation_history.append(f"User: {topic}\nAI: {ai_response}")
-            user_sessions[session_id] = conversation_history  # Update session history
-            return jsonify({'response': ai_response, 'session_id': session_id})
-        else:
-            return jsonify({'error': 'Failed to fetch data from Cohere API'}), 500
-    except Exception as e:
-        return jsonify({'error': str(e)}), 500
-@app.route('/suggest-grammar-questions', methods=['POST'])
-def suggest_grammar_questions():
-    try:
-        data = request.get_json()
-        user_input = data.get('input', '').strip()  # User's partial input (e.g., "What is v")
-        if not user_input:
-            return jsonify({'error': 'Input is required'}), 400
-        prompt = f"""
-            You are a grammar expert. Given the user's input "{user_input}", generate **3 natural grammar-related questions** that people might ask.
-            - The user's input is a **partial or full grammar-related query**.
-            - AI must **infer the most likely grammar topic** based on the input.
-            - AI must **ensure all suggestions are strictly related to English grammar**.
-            - **If the input is incomplete, intelligently complete it** with the most likely grammar concept.
-            - Ensure all **questions are fully formed and relevant**.
-            **User input:** "{user_input}"
-            Provide exactly 3 well-structured, grammar-related questions:
-            """
-        # Call Cohere API
-        headers = {
-            'Authorization': f'Bearer {COHERE_API_KEY}',
-            'Content-Type': 'application/json'
-        }
-        # (2) UPDATED PAYLOAD: messages array
-        payload = {
-            'model': 'command-r-08-2024',
-            'messages': [
-                {'role': 'user', 'content': prompt}
-            ],
-            'max_tokens': 100,
-            'temperature': 0.9
-        }
-        response = requests.post(COHERE_API_URL, headers=headers, json=payload)
-        if response.status_code == 200:
-            # (3) UPDATED PARSING
-            text = _extract_text_v2(response.json())
-            suggestions = [s for s in (text or "").split("\n") if s.strip()]
-            return jsonify({'suggestions': suggestions[:3]})
-            # keep exactly 3 if more lines present
-        else:
-            return jsonify({'error': 'Failed to fetch suggestions', 'details': response.text}), 500
-    except Exception as e:
-        return jsonify({'error': str(e)}), 500
-def validate_topic(topic):
-    validation_prompt = f"""
-    You are an AI grammar expert. Your task is to determine if a given topic is related to **English grammar** or not.
-    **Input:** "{topic}"
-    ### **Rules:**
-    - If the input is **in the form of a question** (e.g., it asks for an explanation or definition), return `"ask grammar topics"`, even if the topic is related to grammar.
-    - If the topic is **related to English grammar concepts** such as **parts of speech**, **verb tenses**, **sentence structure**, etc., return `"Grammar"`.
-    - If the topic is **not related to grammar**, such as general knowledge, science, math, history, or topics from other fields, return `"Not Grammar"`.
-    - Your response must be based purely on whether the topic relates to grammar, and **not** based on specific words, phrases, or examples.
-    **Your response must be exactly either "Grammar", "Not Grammar", or "ask grammar topics". No extra text.**
-    """
-    headers = {
-        'Authorization': f'Bearer {COHERE_API_KEY}',
-        'Content-Type': 'application/json'
-    }
-    # (2) UPDATED PAYLOAD: messages array
-    payload = {
-        'model': 'command-r-08-2024',
-        'messages': [
-            {'role': 'user', 'content': validation_prompt}
-        ],
-        'max_tokens': 5
-    }
-    try:
-        response = requests.post(COHERE_API_URL, json=payload, headers=headers)
-        # (3) UPDATED PARSING
-        validation_result = _extract_text_v2(response.json())
-        # Ensure the response is strictly "Grammar" or "Not Grammar" or "ask grammar topics"
-        if validation_result not in ["Grammar", "Not Grammar", "ask grammar topics"]:
-            return "Not Grammar"  # Fallback to avoid incorrect responses
-        return validation_result
-    except Exception as e:
-        return f"Error: {str(e)}"
-if __name__ == '__main__':
-    # app.run(debug=True)
-    app.register_blueprint(movie_bp, url_prefix='')  # expose /explain-grammar locally
-    app.run(host='0.0.0.0', port=5012, debug=True)

chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54/length.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f8d329104353429c3a4fab240f87e7cba8ac17269bbfe57d26150d03cb34fa0a
-size 400

chroma_db/44944ef3-9b61-4c1b-bc5e-6a49750c0c54/link_lists.bin DELETED Viewed

File without changes

findingword.py DELETED Viewed

@@ -1,276 +0,0 @@
-import openai
-from flask import Flask, jsonify, request, send_from_directory, send_file, Blueprint, current_app, url_for
-import os
-from flask_cors import CORS
-import io  # for streaming S3 bytes in HF/AWS mode
-# Optional (only used in AWS mode)
-try:
-    import boto3
-    from botocore.exceptions import BotoCoreError, ClientError
-except Exception:
-    # Not required for local; will be imported dynamically in AWS mode
-    boto3 = None
-    BotoCoreError = ClientError = Exception
-app = Flask(__name__)
-CORS(app)
-# --- Blueprint ---
-finding_bp = Blueprint("findingword", __name__)
-# Directories for video, audio, and transcripts
-VIDEO_FOLDER = 'static/videos'
-AUDIO_FOLDER = 'static/audio'   # used only in local mode
-TRANSCRIPT_FOLDER = 'static/transcripts'
-# --- OpenAI key handling (same as vocab builder) ---
-_OPENAI_API_KEY_FALLBACK = os.getenv("OPENAI_API_KEY", "")
-def _ensure_openai_key():
-    """Set openai.api_key from Flask config or env before each API call."""
-    api_key = (current_app.config.get("OPENAI_API_KEY") if current_app else None) or _OPENAI_API_KEY_FALLBACK
-    if api_key:
-        openai.api_key = api_key
-# ---------------------- audio-mode helpers ----------------------
-def _is_aws_mode() -> bool:
-    """
-    Switch to AWS Polly + S3 on Hugging Face / prod.
-    Local stays on Google TTS + disk.
-    """
-    if os.getenv("USE_AWS_AUDIO", "0") == "1":
-        return True
-    if os.getenv("SPACE_ID"):  # set on Hugging Face Spaces
-        return True
-    if os.getenv("ENV", "dev").lower() == "prod":
-        return True
-    return False
-def _sanitize_filename(word: str) -> str:
-    # Keep your current style but ensure safe S3 key/filename
-    return word.strip().replace(" ", "_").replace(".", "").lower()
-# ---------------------------------------------------------------------
-@finding_bp.route('/generate-vocabulary', methods=['GET'])
-def get_vocabulary_word_from_openai():
-    prompt = (
-        "Pick a simple vocabulary word suitable for children (ages 6–8) "
-        "and provide its meaning in very easy English. Do not repeat words from previous responses. "
-        "Format: 'Word: [word]. Meaning: [meaning].'"
-    )
-    try:
-        _ensure_openai_key()
-        response = openai.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {"role": "user", "content": prompt},
-            ]
-        )
-        result = response.choices[0].message.content.strip()
-        print(f"Full Response: {result}")
-        if "Word:" in result and "Meaning:" in result:
-            parts = result.split("Meaning:")
-            word = parts[0].replace("Word:", "").strip()
-            word = word.rstrip('.')  # avoid trailing dot
-            meaning = parts[1].strip()
-            # Generate the sentence
-            sentence = generate_sentence(word, meaning)
-            # Generate audio file for the vocabulary word
-            audio_file_path_or_name = generate_audio(word)  # local path or just filename in AWS mode
-            # URL for frontend remains identical
-            # audio_url = f"/static/audio/{os.path.basename(audio_file_path_or_name)}"
-            audio_url = url_for("findingword.serve_audio",
-                    filename=os.path.basename(audio_file_path_or_name))
-            return jsonify({
-                "word": word,
-                "meaning": meaning,
-                "sentence": sentence,
-                "audio_file_path": audio_url
-            })
-        else:
-            return jsonify({"response": result, "message": "Meaning not provided in the expected format"})
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500
-def generate_sentence(word, meaning):
-    prompt = f"Create a sentence using the word '{word}' that fully demonstrates its meaning: {meaning}"
-    _ensure_openai_key()
-    response = openai.chat.completions.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": prompt},
-        ]
-    )
-    sentence = response.choices[0].message.content.strip()
-    return sentence
-def generate_audio(word):
-    """
-    Local (default): Google TTS → write MP3 to ./static/audio/<word>.mp3 → return full path.
-    Hugging Face / AWS mode: Polly → upload to S3 (findingword/<word>.mp3) → return just the filename,
-    and let /static/audio/<filename> stream from S3 (see route below).
-    """
-    sanitized_word = _sanitize_filename(word)
-    filename = f"{sanitized_word}.mp3"
-    if _is_aws_mode():
-        # ---- AWS Polly + S3 path (no local write) ----
-        if boto3 is None:
-            raise RuntimeError("boto3 is required in AWS audio mode but not available")
-        region = os.getenv("AWS_DEFAULT_REGION", "eu-north-1")
-        bucket = os.getenv("S3_BUCKET_NAME")
-        if not bucket:
-            raise RuntimeError("S3_BUCKET_NAME is not set")
-        polly = boto3.client("polly", region_name=region)
-        s3 = boto3.client("s3", region_name=region)
-        try:
-            resp = polly.synthesize_speech(
-                Text=word,
-                OutputFormat="mp3",
-                VoiceId=os.getenv("POLLY_VOICE_ID", "Joanna"),
-                Engine=os.getenv("POLLY_ENGINE", "standard"),
-                LanguageCode="en-US",
-            )
-            stream = resp.get("AudioStream")
-            if not stream:
-                raise RuntimeError("Polly returned no AudioStream")
-            audio_bytes = stream.read()
-        except (BotoCoreError, ClientError, Exception) as e:
-            raise RuntimeError(f"Polly TTS failed: {e}")
-        key = f"findingword/{filename}"
-        try:
-            s3.put_object(Bucket=bucket, Key=key, Body=audio_bytes, ContentType="audio/mpeg")
-        except (BotoCoreError, ClientError, Exception) as e:
-            raise RuntimeError(f"S3 upload failed: {e}")
-        # Return only the filename; /static/audio/<filename> will proxy from S3
-        return filename
-    # ---- Local Google TTS path (lazy import; create dir here only) ----
-    audio_dir = AUDIO_FOLDER
-    try:
-        os.makedirs(audio_dir, exist_ok=True)
-    except Exception:
-        # Fallback if CWD is restricted
-        audio_dir = "/tmp/audio"
-        os.makedirs(audio_dir, exist_ok=True)
-    audio_file_path = os.path.join(audio_dir, filename)
-    if not os.path.exists(audio_file_path):
-        try:
-            # Import only in local mode to avoid HF credential errors
-            from google.cloud import texttospeech
-            gcp_client = texttospeech.TextToSpeechClient()
-        except Exception as e:
-            raise RuntimeError(
-                "Google TTS is required in local mode but missing. "
-                "Install google-cloud-texttospeech and set GOOGLE_APPLICATION_CREDENTIALS. "
-                f"Details: {e}"
-            )
-        synthesis_input = texttospeech.SynthesisInput(text=word)
-        voice = texttospeech.VoiceSelectionParams(
-            language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
-        )
-        audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
-        response = gcp_client.synthesize_speech(
-            input=synthesis_input, voice=voice, audio_config=audio_config
-        )
-        with open(audio_file_path, "wb") as out:
-            out.write(response.audio_content)
-        print(f"✅ Audio saved: {audio_file_path}")
-    return audio_file_path
-@finding_bp.route('/validate-word', methods=['POST'])
-def validate_word():
-    try:
-        data = request.get_json()
-        print("📥 Received data for validation:", data)
-        if not data or 'user_input' not in data or 'correct_word' not in data:
-            return jsonify({"error": "Invalid request, missing fields"}), 400
-        user_input = data.get('user_input', '').strip()
-        correct_word = data.get('correct_word', '').strip()
-        if user_input.lower() == correct_word.lower():
-            return jsonify({"status": "success", "message": "Correct! You typed the word correctly."})
-        else:
-            return jsonify({"status": "failure", "message": f"Incorrect. The correct word was '{correct_word}'."})
-    except Exception as e:
-        return jsonify({"error": str(e)}), 500
-@finding_bp.route('/static/audio/<filename>')
-def serve_audio(filename):
-    """
-    Local: serve from disk.
-    AWS mode (HF): fetch the object from S3 and stream it (no local storage).
-    """
-    if _is_aws_mode():
-        if boto3 is None:
-            return jsonify({"error": "boto3 missing in AWS mode"}), 500
-        region = os.getenv("AWS_DEFAULT_REGION", "eu-north-1")
-        bucket = os.getenv("S3_BUCKET_NAME")
-        if not bucket:
-            return jsonify({"error": "S3_BUCKET_NAME not set"}), 500
-        s3 = boto3.client("s3", region_name=region)
-        key = f"findingword/{filename}"
-        try:
-            obj = s3.get_object(Bucket=bucket, Key=key)
-            data = obj["Body"].read()
-            return send_file(
-                io.BytesIO(data),
-                mimetype="audio/mpeg",
-                download_name=filename,
-                as_attachment=False
-            )
-        except (BotoCoreError, ClientError, Exception) as e:
-            return jsonify({"error": f"S3 fetch failed: {str(e)}"}), 404
-    # Local: serve file from disk as before (with /tmp fallback)
-    local_path = os.path.join(AUDIO_FOLDER, filename)
-    if os.path.exists(local_path):
-        return send_from_directory(AUDIO_FOLDER, filename)
-    alt_dir = "/tmp/audio"
-    alt_path = os.path.join(alt_dir, filename)
-    if os.path.exists(alt_path):
-        return send_from_directory(alt_dir, filename)
-    return jsonify({"error": "File not found"}), 404
-# Run the Flask server (local dev): keep URLs unchanged by registering with empty prefix
-if __name__ == '__main__':
-    app.register_blueprint(finding_bp, url_prefix='')  # Local: /generate-vocabulary, /validate-word, /static/audio/...
-    app.run(host='0.0.0.0', port=5005, debug=True)

generateQuestion.py DELETED Viewed

@@ -1,535 +0,0 @@
-"""
-Grammar Question Generation and Validation Module
-This module provides endpoints for:
-- Generating fill-in-the-blank grammar questions at various difficulty levels
-- Batch validating user answers with AI-powered feedback
-- Providing hints for incorrect answers
-All AI operations are powered by Cohere's API v2.
-"""
-import logging
-import os
-from typing import Optional, Dict, Any, List
-import requests
-from flask import Blueprint, jsonify, request, current_app
-# ------------------------------------------------------------------------------
-# Configuration Constants
-# ------------------------------------------------------------------------------
-COHERE_API_URL = 'https://api.cohere.com/v2/chat'
-COHERE_MODEL = 'command-r-08-2024'
-# Token limits for different operations
-TOKEN_LIMITS = {
-  'validation': 5,
-  'answer_validation_detailed': 200,
-  'hint_generation': 250,
-  'question_generation': 1000
-}
-# Request timeouts (seconds)
-API_TIMEOUT = 30
-# Difficulty levels
-VALID_DIFFICULTIES = ['basic', 'intermediate', 'expert']
-# Validation response types
-VALIDATION_RESPONSES = ['Grammar', 'Not Grammar', 'ask grammar topics']
-# ------------------------------------------------------------------------------
-# Blueprint Setup
-# ------------------------------------------------------------------------------
-questions_bp = Blueprint('questions', __name__)
-# Configure logging
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-# ------------------------------------------------------------------------------
-# Helper Functions
-# ------------------------------------------------------------------------------
-def _get_cohere_headers() -> Optional[Dict[str, str]]:
-  """
-  Get Cohere API headers with authentication.
-  Prefers API key from Flask app config, falls back to environment variable.
-  Returns:
-      Dict containing Authorization and Content-Type headers, or None if key not found.
-  """
-  api_key = current_app.config.get('COHERE_API_KEY') or os.getenv('COHERE_API_KEY', '')
-  if not api_key:
-    logger.error('COHERE_API_KEY is not configured')
-    return None
-  return {
-    'Authorization': f'Bearer {api_key}',
-    'Content-Type': 'application/json',
-  }
-def _extract_text_from_cohere_v2_response(response_json: Dict[str, Any]) -> str:
-  """
-  Extract text content from Cohere API v2 response.
-  The v2 /chat endpoint returns:
-  {
-    "message": {
-      "content": [
-        {"type": "text", "text": "..."}
-      ]
-    }
-  }
-  Args:
-      response_json: The JSON response from Cohere API
-  Returns:
-      Extracted text content or empty string if not found
-  """
-  message = response_json.get('message', {})
-  content = message.get('content', [])
-  if isinstance(content, list) and content:
-    first_block = content[0]
-    if isinstance(first_block, dict):
-      return (first_block.get('text') or '').strip()
-  return ''
-def _call_cohere_api(prompt: str, max_tokens: int, temperature: float = 0.7) -> Optional[str]:
-  """
-  Make a call to Cohere API with standardized error handling.
-  Args:
-      prompt: The prompt to send to the AI
-      max_tokens: Maximum tokens for the response
-      temperature: Temperature for response generation (0.0-1.0)
-  Returns:
-      The AI response text, or None if an error occurred
-  """
-  headers = _get_cohere_headers()
-  if not headers:
-    logger.error('Cannot call Cohere API: headers not available')
-    return None
-  payload = {
-    'model': COHERE_MODEL,
-    'messages': [
-      {'role': 'user', 'content': prompt}
-    ],
-    'max_tokens': max_tokens,
-    'temperature': temperature
-  }
-  try:
-    response = requests.post(
-      COHERE_API_URL,
-      json=payload,
-      headers=headers,
-      timeout=API_TIMEOUT
-    )
-    if response.status_code == 200:
-      return _extract_text_from_cohere_v2_response(response.json())
-    else:
-      logger.error(
-        f'Cohere API returned status {response.status_code}: {response.text}'
-      )
-      return None
-  except requests.exceptions.Timeout:
-    logger.error(f'Cohere API request timed out after {API_TIMEOUT} seconds')
-    return None
-  except requests.exceptions.RequestException as e:
-    logger.error(f'Cohere API request failed: {str(e)}')
-    return None
-  except Exception as e:
-    logger.error(f'Unexpected error calling Cohere API: {str(e)}')
-    return None
-def _validate_input_length(text: str, max_length: int = 500) -> bool:
-  """
-  Validate that input text doesn't exceed maximum length.
-  Args:
-      text: Input text to validate
-      max_length: Maximum allowed length
-  Returns:
-      True if valid, False otherwise
-  """
-  return len(text.strip()) <= max_length
-def _get_question_generation_prompt(topic: str, difficulty: str) -> str:
-  """
-  Get the appropriate prompt for question generation based on difficulty.
-  Args:
-      topic: The grammar topic
-      difficulty: The difficulty level (basic, intermediate, expert)
-  Returns:
-      The formatted prompt string
-  """
-  if difficulty == 'basic':
-    return f"""
-Generate five **completely new and unique** very basic-level fill-in-the-blank grammar questions **every time** on the topic '{topic}'.
-### Rules:
-- Generate five unique fill-in-the-blank grammar questions based on the topic '{topic}'.
-- Each question must have exactly one blank represented by '_______' (not two blanks or underscores inside the sentence).
-- Each question must have a different theme for variety.
-- Use different sentence structures; avoid predictable patterns.
-- Avoid long words or abstract concepts.
-- Focus on the topic '{topic}', and ensure the blank is the key part of speech.
-- Each question must include the correct answer in parentheses at the end.
-- Do not include any explanations or instructions—only the five questions.
-"""
-  elif difficulty == 'intermediate':
-    return f"""
-Generate five **completely new and unique** intermediate-level fill-in-the-blank grammar questions **every time** on the topic '{topic}'.
-### Rules:
-- Generate five unique fill-in-the-blank grammar questions based on the topic '{topic}'.
-- Each question must have exactly one blank represented by '_______'.
-- Slightly more challenging than basic-level; use a wider range of sentence structures and vocabulary.
-- Each question must have a different theme.
-- Sentences should be longer and include more detail.
-- Focus on the topic '{topic}', and ensure the blank is the key part of speech.
-- Each question must include the correct answer in parentheses at the end.
-- Do not include any explanations or instructions—only the five questions.
-"""
-  else:  # expert
-    return f"""
-Generate five **completely new and unique** advanced-level (C1) fill-in-the-blank grammar questions **every time** on the topic '{topic}'.
-### Rules:
-- Generate five unique fill-in-the-blank grammar questions based on the topic '{topic}'.
-- Each question must have exactly one blank represented by '_______'.
-- More challenging than intermediate (C1); require expert-level mastery of grammar and context.
-- Ensure varied and sophisticated vocabulary; avoid basic words.
-- Each question should require nuanced comprehension; test advanced grammar patterns.
-- The blank must be the key part of the sentence (not an obvious answer).
-- Each question must include the correct answer in parentheses at the end.
-- Do not include any explanations or instructions—only the five questions.
-"""
-# ------------------------------------------------------------------------------
-# Core Functions
-# ------------------------------------------------------------------------------
-def validate_topic(topic: str) -> str:
-  """
-  Validate whether a given topic is related to English grammar.
-  Args:
-      topic: The topic to validate
-  Returns:
-      One of: 'Grammar', 'Not Grammar', 'ask grammar topics', or an error message
-  """
-  if not _validate_input_length(topic, max_length=200):
-    return 'Not Grammar'
-  validation_prompt = f"""
-You are a highly knowledgeable AI grammar expert. Your task is to evaluate whether the given topic relates to **English grammar** or not.
-**Input Topic:** "{topic}"
-### **Instructions:**
-- If the input **exactly refers to** grammar concepts (such as **parts of speech**, **verb tenses**, **sentence structure**, **grammar rules**, etc.), respond with `"Grammar"`.
-- If the input **seems to be a general question or concept** that is **not directly related to grammar**, such as general knowledge, science, history, or unrelated fields, respond with `"Not Grammar"`.
-- If the input is in the form of a **question** (e.g., "What is subject-verb agreement?"), respond with `"ask grammar topics"`.
-- If the topic refers to a **specific grammar concept** (e.g., **noun**, **verb**, **preposition**, **past tense**, etc.), always classify it as `"Grammar"`.
-- **Do not include any explanations or examples**. Your answer must only be `"Grammar"`, `"Not Grammar"`, or `"ask grammar topics"`, depending on whether the topic is relevant to grammar.
-- If the input is **unclear**, err on the side of classifying it as `"Not Grammar"` rather than `"Grammar"`.
-Your response must only be one of these three options:
-- `"Grammar"`
-- `"Not Grammar"`
-- `"ask grammar topics"`
-No extra text or explanation.
-"""
-  result = _call_cohere_api(
-    validation_prompt,
-    max_tokens=TOKEN_LIMITS['validation'],
-    temperature=0.3
-  )
-  if result is None:
-    return 'Error: Unable to validate topic'
-  if result not in VALIDATION_RESPONSES:
-    return 'Not Grammar'
-  return result
-def validate_single_answer(topic: str, question: str, user_answer: str) -> str:
-  """
-  Validate a single answer using AI.
-  Args:
-      topic: The grammar topic
-      question: The question being answered
-      user_answer: The user's answer
-  Returns:
-      Validation response from the AI
-  """
-  prompt = f"""
-You are a highly knowledgeable grammar assistant. Validate whether the user's answer to the following question is correct or not based on {topic}. If the answer is incorrect, provide a helpful hint.
-Topic: {topic}
-Question: "{question}"
-User's Answer: "{user_answer}"
-Is the answer correct? If not, please explain why and give a hint.
-"""
-  result = _call_cohere_api(
-    prompt,
-    max_tokens=TOKEN_LIMITS['answer_validation_detailed'],
-    temperature=0.7
-  )
-  if result is None:
-    return 'Error: Unable to validate answer'
-  return result
-def generate_hint(topic: str, question: str, user_answer: str) -> str:
-  """
-  Generate a helpful hint for an incorrect answer.
-  Args:
-      topic: The grammar topic
-      question: The question
-      user_answer: The user's incorrect answer
-  Returns:
-      A helpful hint without revealing the answer
-  """
-  prompt = f"""
-You are a highly skilled grammar assistant. Your task is to generate a helpful hint for the user to improve their answer based on the following question.
-Topic: {topic}
-Question: "{question}"
-User's Answer: "{user_answer}"
-If the user's answer is incorrect, provide a specific, actionable hint to help the user correct their answer.
-The hint should include:
-- Explanation of the error made by the user.
-- A hint on the correct grammatical structure or word form.
-- A hint on how to structure the sentence correctly **without revealing the exact answer**.
-Please make sure the hint is **clear** and **helpful** for the user, **without revealing the correct answer**.
-"""
-  result = _call_cohere_api(
-    prompt,
-    max_tokens=TOKEN_LIMITS['hint_generation'],
-    temperature=0.7
-  )
-  if result is None:
-    return 'Error: Unable to generate hint'
-  return result
-# ------------------------------------------------------------------------------
-# API Endpoints
-# ------------------------------------------------------------------------------
-@questions_bp.post('/generate-questions')
-def generate_questions():
-  """
-  Generate grammar questions based on topic and difficulty.
-  Expected JSON payload:
-  {
-    "topic": "string",
-    "difficulty": "basic" | "intermediate" | "expert"
-  }
-  Returns:
-      JSON response with generated questions or error message
-  """
-  try:
-    data = request.get_json()
-    if not data:
-      return jsonify({'error': 'Request body must be JSON'}), 400
-    # Extract and validate inputs
-    topic = data.get('topic', '').strip()
-    difficulty = data.get('difficulty', 'basic').lower()
-    if not topic:
-      return jsonify({'error': 'Topic is required'}), 400
-    if not _validate_input_length(topic, max_length=200):
-      return jsonify({'error': 'Topic exceeds maximum length of 200 characters'}), 400
-    if difficulty not in VALID_DIFFICULTIES:
-      return jsonify({
-        'error': f'Invalid difficulty level. Must be one of: {", ".join(VALID_DIFFICULTIES)}'
-      }), 400
-    # Validate topic is grammar-related
-    validation_result = validate_topic(topic)
-    if validation_result.startswith('Error:'):
-      logger.error(f'Topic validation error: {validation_result}')
-      return jsonify({'error': 'Unable to validate topic at this time'}), 500
-    if validation_result != 'Grammar':
-      return jsonify({
-        'message': 'Please enter a valid **grammar topic**, not a general word or unrelated question.'
-      }), 400
-    logger.info(f'Generating {difficulty} questions for topic: {topic}')
-    # Generate questions
-    prompt = _get_question_generation_prompt(topic, difficulty)
-    result = _call_cohere_api(
-      prompt,
-      max_tokens=TOKEN_LIMITS['question_generation'],
-      temperature=0.8
-    )
-    if result is None:
-      return jsonify({
-        'error': 'Failed to generate questions',
-        'details': 'Unable to reach AI service'
-      }), 500
-    return jsonify({'text': result}), 200
-  except Exception as e:
-    logger.exception(f'Unexpected error in generate_questions: {str(e)}')
-    return jsonify({'error': 'An unexpected error occurred'}), 500
-@questions_bp.post('/validate-all-answers')
-def validate_all_answers():
-  """
-  Validate multiple answers at once (batch validation).
-  Expected JSON payload:
-  {
-    "questions": [
-      {
-        "topic": "string",
-        "question": "string",
-        "user_answer": "string"
-      }
-    ]
-  }
-  Returns:
-      JSON response with validation results for all questions
-  """
-  try:
-    data = request.get_json()
-    if not data:
-      return jsonify({'error': 'Request body must be JSON'}), 400
-    questions = data.get('questions', [])
-    if not questions:
-      return jsonify({'error': 'No questions provided'}), 400
-    if not isinstance(questions, list):
-      return jsonify({'error': 'Questions must be an array'}), 400
-    if len(questions) > 50:
-      return jsonify({'error': 'Maximum 50 questions allowed per request'}), 400
-    validation_results = []
-    for item in questions:
-      if not isinstance(item, dict):
-        validation_results.append({
-          'error': 'Invalid question format'
-        })
-        continue
-      topic = item.get('topic', '').strip()
-      question = item.get('question', '').strip()
-      user_answer = item.get('user_answer', '').strip()
-      if not all([topic, question, user_answer]):
-        validation_results.append({
-          'question': question,
-          'error': 'Missing required fields (topic, question, or user_answer)'
-        })
-        continue
-      # Validate input lengths
-      if not _validate_input_length(topic, 200) or not _validate_input_length(question, 500) or not _validate_input_length(user_answer, 500):
-        validation_results.append({
-          'question': question,
-          'error': 'Input exceeds maximum length'
-        })
-        continue
-      # Validate the answer
-      validation_response = validate_single_answer(topic, question, user_answer)
-      # Generate hint if answer is incorrect
-      hint = None
-      if isinstance(validation_response, str) and (
-        'incorrect' in validation_response.lower() or
-        'not correct' in validation_response.lower()
-      ):
-        hint = generate_hint(topic, question, user_answer)
-      validation_results.append({
-        'question': question,
-        'user_answer': user_answer,
-        'validation_response': validation_response,
-        'hint': hint
-      })
-    return jsonify({'results': validation_results}), 200
-  except Exception as e:
-    logger.exception(f'Unexpected error in validate_all_answers: {str(e)}')
-    return jsonify({'error': 'An unexpected error occurred'}), 500
-# ------------------------------------------------------------------------------
-# Health Check
-# ------------------------------------------------------------------------------
-@questions_bp.get('/health')
-def health():
-  """
-  Health check endpoint for the questions service.
-  Returns:
-      JSON response indicating service status
-  """
-  return jsonify({
-    'status': 'healthy',
-    'service': 'grammar-questions',
-    'cohere_api_configured': bool(_get_cohere_headers())
-  }), 200

googlecredentails.json DELETED Viewed

@@ -1,13 +0,0 @@
-{
-  "type": "service_account",
-  "project_id": "map-pykara-1551704617990",
-  "private_key_id": "a124b056c0d611f0b5845f343d1210e8c2bad0fc",
-  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDHsBOK9V61bKle\niq/2P/fNJ51JcLvi0xcpkQTtpvaFfEX+b6ACE6rOQf1M8+s3oCXENn7MN8XXUyuj\nGMPUPnXSOujuEA2d+0oi7bUifkucPNhbBqHcymp3XM8tt6/75Vfc0avXapHDe2td\nyEQ8WoisbzvdPzy7r1v//8aeEQ10gKZiKDWqVWXdyrNq48zVbMpdwWmLJm92aFpl\nIRStypAerewPZaNF7qACvVwHMXh6bIebr8gPg8gyOTnf5MVu80esW1CC2DSwX5SF\nIRjNcT7Yrb/O0s8awgULLBcyPp/4LpWjJP/l79Bu3+tm2NPU7Qkc55Q3PIA01Fr0\nJeZPXMpLAgMBAAECggEAM/kqpPzHOT4+ePps4RF2LEH2yLIcXOWnPizeFQLjYAvf\n5eDkyxWWW9fGF1zXKGO44LB0YS/VAP3HOkyMT7YwEVc+4BNyj99jROMMxZ0Mje4y\nO8LmpyJSAp432ETae5wOOc2ixc8ZgEEjyUWCKRlBQGw1Lxkx9AGo1uEaL3Ltxxfs\ns/JzY4i2gVVqoCD7dUSFWt7rnjTm0LXz+cQlCMSJVOnpj3rMhSCGsj0dkJusW+BP\nF59yjhNVCs92MS5VidU/Ud8XDjbLzaSdsXATTZ5UGFBnSARNqqa921jBdkgS5/9d\n+KY13w7Se28lkDgR7EvSTCXdWjTvcA+yAW3/4frQyQKBgQD5VNOLwtUOzH5zh8tF\nRye103zqRvag1cR2CeyqaEkBCznVVKDfVvgkTM0XqxwA1p4bQjMvTLmdntI1ubjd\ndcS+t0042xevqEeIfkTA4bB8QsdODDCxWPhmH4HwXMahbfCpm5gAdGalf6qcDt0j\n/lW0S2WowXC2yVzB3hc2tGHpgwKBgQDNB1cXBzRPBcXYcX+JObMh6+IkWHozpw9D\nryaJISAgBF/MW9ZVEexkmhBbcj1b+MWfGa/U4gIXks1RkOiPyn6ywL7tDOLV1+tV\nOlG/dWepWtfuHkdaLDBIhnxykGDqpU/0Y4R3JrmofO1r3lc/uPgUUltVg3bNLerI\nqtb2vnDpmQKBgC4pvHk1+4if6BGv5LzQ7dNGcuxVczhYG+XW9JCgelPNJkoPPzHa\nwlrGNXraXXbyRZe6bAun4v4B992mo0mtkl3VRmDuf7YwK/5jkos7vhdjrc8Phrxv\nQp512vML2mLtHg/pFP2Qj6i4uHfocJ1Ha8rT4uCZ4CqXoarrWdTxFOfNAoGAFwjj\nFPg/PT2Vy8p8nKs53+7DenfiStlTErSj7LYnCNHU/X2359jaqTbR7aQ5FpMtiMF3\nCsDVoVZh8O8J4dXLREP5b2KKPaJDk1C5DHyhR9qn9d27AHuEdTF+29Qyv0oRYJCp\nukVEiJR4jCzvun4KiSXzkvjxKP4mqaLgAdrFjskCgYEArF/mdtBotpOtI22CrWQ2\nG9kKR9USSHik39lj8thANirF/jLdcEea0c/WvLE7tcuJqcJ0hhGZVtoiKVWDyTTJ\nncwRdGHGCau5p5a1gWca/NgXGhUnq3X6AehUcBu4xJnP2Y/PMiAxiBWBRw08ZyNk\nQUyDANxdQVM9B0R8sqPbwDM=\n-----END PRIVATE KEY-----\n",
-  "client_email": "learnenglishai@map-pykara-1551704617990.iam.gserviceaccount.com",
-  "client_id": "106031173963438453050",
-  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
-  "token_uri": "https://oauth2.googleapis.com/token",
-  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
-  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/learnenglishai%40map-pykara-1551704617990.iam.gserviceaccount.com",
-  "universe_domain": "googleapis.com"
-}

listen.py DELETED Viewed

@@ -1,436 +0,0 @@
-# listen.py
-from flask import Flask, Blueprint, jsonify, send_file, abort, request, send_from_directory
-from flask_cors import CORS
-from moviepy.editor import VideoFileClip
-from google.cloud import speech
-import os
-print(f"GOOGLE_APPLICATION_CREDENTIALS: {os.getenv('GOOGLE_APPLICATION_CREDENTIALS')}")
-import uuid
-import requests
-from pydub import AudioSegment
-import ffmpeg
-import re
-import io  # for streaming S3 bytes in HF/AWS mode
-import json  # <-- added for JSON creds parsing
-# Optional (only used in AWS mode)
-try:
-    import boto3
-    from botocore.exceptions import BotoCoreError, ClientError
-except Exception:
-    boto3 = None
-    BotoCoreError = ClientError = Exception
-# ---------- Blueprint ----------
-# Blueprint that carries every route defined in this module; the __main__
-# block at the bottom registers it on a standalone Flask app.
-listen_bp = Blueprint("listen", __name__)
-# ---------------------- storage mode helpers ----------------------
-def _is_aws_video_mode() -> bool:
-    """
-    Switch to S3 on Hugging Face / prod. Local stays on disk.
-    """
-    if os.getenv("USE_AWS_VIDEO", "0") == "1":
-        return True
-    if os.getenv("SPACE_ID"):  # set on Hugging Face Spaces
-        return True
-    if os.getenv("ENV", "dev").lower() == "prod":
-        return True
-    return False
-def _s3_clients():
-    if boto3 is None:
-        raise RuntimeError("boto3 is required in AWS video mode but not available")
-    region = os.getenv("AWS_DEFAULT_REGION", "eu-north-1")
-    s3 = boto3.client("s3", region_name=region)
-    return s3
-def _video_s3_bucket():
-    bucket = os.getenv("S3_BUCKET_NAME")
-    if not bucket:
-        raise RuntimeError("S3_BUCKET_NAME is not set")
-    return bucket
-def _video_s3_key(filename: str) -> str:
-    # Prefix under which listen.py stores videos in the same bucket
-    prefix = os.getenv("LISTEN_S3_PREFIX", "listen")
-    prefix = prefix.strip().strip("/")
-    return f"{prefix}/{filename}"
-# ---------- writable working directories ----------
-# Base working dir: /tmp on HF/AWS; local stays under ./static (or override via LISTEN_WORKDIR)
-# Root of all working files. LISTEN_WORKDIR wins when set; otherwise the
-# default depends on the storage mode evaluated once at import time.
-_BASE_WORKDIR = os.getenv(
-    "LISTEN_WORKDIR",
-    "/tmp/listen" if _is_aws_video_mode() else os.path.abspath("static")
-)
-# Per-kind sub-folders; each may be re-pointed to /tmp/listen by the loop below.
-VIDEO_FOLDER = os.path.join(_BASE_WORKDIR, "videos")
-AUDIO_FOLDER = os.path.join(_BASE_WORKDIR, "audio")
-TRANSCRIPT_FOLDER = os.path.join(_BASE_WORKDIR, "transcripts")
-# Ensure directories exist (with hard fallback to /tmp if needed)
-for _pname in ("videos", "audio", "transcripts"):
-    _p = os.path.join(_BASE_WORKDIR, _pname)
-    try:
-        os.makedirs(_p, exist_ok=True)
-    except Exception:
-        # The fallback is applied per folder: only the folder whose creation
-        # failed is moved under /tmp/listen, the others keep their base path.
-        # A failure of the fallback makedirs itself is not caught here.
-        _fallback_base = "/tmp/listen"
-        os.makedirs(os.path.join(_fallback_base, _pname), exist_ok=True)
-        if _pname == "videos":
-            VIDEO_FOLDER = os.path.join(_fallback_base, "videos")
-        elif _pname == "audio":
-            AUDIO_FOLDER = os.path.join(_fallback_base, "audio")
-        else:
-            TRANSCRIPT_FOLDER = os.path.join(_fallback_base, "transcripts")
-# ---------------- Cohere configuration (migrated to v2 Chat) ----------------
-# API key read from the environment; an empty string when the variable is unset.
-COHERE_API_KEY = os.getenv("COHERE_API_KEY", "")
-# Cohere v2 chat endpoint that the question-generation route POSTs to.
-COHERE_API_URL = 'https://api.cohere.com/v2/chat'
-# ---------------------------------------------------------------------------
-# --- Google Cloud Speech-to-Text client init (prefers HF secret JSON) ---
-def _make_speech_client():
-    sa_json = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
-    if sa_json:
-        try:
-            info = json.loads(sa_json)
-            return speech.SpeechClient.from_service_account_info(info)
-        except Exception as e:
-            print(f"Failed to parse GOOGLE_APPLICATION_CREDENTIALS_JSON: {e}")
-            # fall through to default ADC
-    return speech.SpeechClient()
-# Module-level client, created once when this module is imported and used by
-# the question-generation route for transcription.
-speech_client = _make_speech_client()
-# -------------------------------------------------------------------------
-# ------------- Cohere v2 helper (extract text from chat response) -------------
-def _extract_text_v2(resp_json: dict) -> str:
-    """
-    Cohere v2 /chat returns:
-      { "message": { "content": [ { "type": "text", "text": "..." }, ... ] } }
-    This pulls the first text block.
-    """
-    msg = resp_json.get("message", {})
-    content = msg.get("content", [])
-    for block in content:
-        if isinstance(block, dict) and block.get("type") == "text":
-            text = (block.get("text") or "").strip()
-            if text:
-                return text
-    return ""
-# -----------------------------------------------------------------------------
-# Convert video to audio
-def convert_video_to_audio(video_path, audio_path):
-    try:
-        # Using moviepy to extract audio from video
-        video = VideoFileClip(video_path)
-        video.audio.write_audiofile(audio_path, codec='mp3')
-        return audio_path
-    except Exception as e:
-        print(f"Error converting video to audio: {str(e)}")
-        return None
-# Re-encode MP3 to ensure proper format
-def reencode_mp3(input_audio_path, output_audio_path):
-    try:
-        # Using pydub to convert and re-encode MP3 (ensuring correct encoding)
-        audio = AudioSegment.from_mp3(input_audio_path)
-        audio.export(output_audio_path, format="mp3", codec="libmp3lame", parameters=["-q:a", "0"])
-        return output_audio_path
-    except Exception as e:
-        print(f"Error re-encoding MP3: {str(e)}")
-        return None
-# Helper function to convert audio to the proper MP3 encoding if necessary
-def convert_audio_to_mp3(input_file_path, output_file_path):
-    """
-    Converts the audio file to a valid MP3 format with proper encoding.
-    """
-    try:
-        ffmpeg.input(input_file_path).output(output_file_path, acodec='libmp3lame', audio_bitrate='128k').run()
-        return True
-    except Exception as e:
-        print(f"Error during audio conversion: {e}")
-        return False
-# Function to compress audio dynamically
-def compress_audio(input_file_path, output_file_path, target_bitrate="128k"):
-    audio = AudioSegment.from_file(input_file_path)
-    audio.export(output_file_path, format="mp3", bitrate=target_bitrate)
-    return output_file_path
-# ---------------------------- Routes (Blueprint) ----------------------------
-@listen_bp.route('/', methods=['GET'])
-def home():
-    return "Welcome to the Flask app! The server is running."
-@listen_bp.route('/videos', methods=['GET'])
-def list_videos():
-    """
-    List available videos for users to watch.
-    """
-    # If you maintain a VIDEOS list elsewhere, return it here.
-    # Returning empty list so the endpoint stays valid.
-    return jsonify([]), 200
-@listen_bp.route('/videos/<filename>')
-def serve_video(filename):
-    """
-    Local: serve file from disk.
-    HF/AWS: fetch object from S3 and stream bytes (no redirect).
-    """
-    if _is_aws_video_mode():
-        try:
-            s3 = _s3_clients()
-            bucket = _video_s3_bucket()
-            key = _video_s3_key(filename)
-            obj = s3.get_object(Bucket=bucket, Key=key)
-            data = obj["Body"].read()
-            return send_file(
-                io.BytesIO(data),
-                mimetype="video/mp4",
-                download_name=filename,
-                as_attachment=False
-            )
-        except (BotoCoreError, ClientError, Exception) as e:
-            print(f"S3 fetch failed for {filename}: {e}")
-            abort(404)
-    # Local
-    video_path = os.path.join(VIDEO_FOLDER, filename)
-    if not os.path.exists(video_path):
-        print(f"Video file not found: {filename}")
-        abort(404)
-    return send_file(video_path, mimetype='video/mp4')
-@listen_bp.route('/upload-video', methods=['POST'])
-def upload_video():
-    """
-    Local: save to static/videos or /tmp/listen/videos (depending on mode).
-    HF/AWS: upload to S3 (no local original).
-    """
-    print("Received upload request.")
-    if 'video' not in request.files:
-        print("No video file provided in the request.")
-        return jsonify({'error': 'No video file provided'}), 400
-    video = request.files['video']
-    if video.filename == '':
-        print("Empty filename detected.")
-        return jsonify({'error': 'No selected file'}), 400
-    try:
-        filename = f"{uuid.uuid4()}.mp4"
-        if _is_aws_video_mode():
-            try:
-                s3 = _s3_clients()
-                bucket = _video_s3_bucket()
-                key = _video_s3_key(filename)
-                s3.put_object(
-                    Bucket=bucket,
-                    Key=key,
-                    Body=video.stream.read(),
-                    ContentType="video/mp4"
-                )
-                print(f"Uploaded to S3: s3://{bucket}/{key}")
-            except (BotoCoreError, ClientError, Exception) as e:
-                print(f"S3 upload error: {e}")
-                return jsonify({'error': 'Failed to upload to S3'}), 500
-        else:
-            # Save locally
-            video_path = os.path.join(VIDEO_FOLDER, filename)
-            print(f"Saving video: {filename}")
-            video.save(video_path)
-            print(f"Video saved successfully at {video_path}")
-        return jsonify({'message': 'Video uploaded successfully!', 'filename': filename}), 200
-    except Exception as e:
-        print(f"Error saving video: {str(e)}")
-        return jsonify({'error': 'Failed to save video'}), 500
-@listen_bp.route('/generate-questions-dynamicvideo', methods=['POST'])
-def generate_questions():
-    try:
-        data = request.json
-        video_filename = data.get('filename')
-        if not video_filename:
-            print("Error: No filename provided in request.")
-            return jsonify({"error": "Filename is required"}), 400
-        # Resolve a local readable path for processing
-        video_path = os.path.join(VIDEO_FOLDER, video_filename)
-        if _is_aws_video_mode():
-            # Download object bytes to a local working file path
-            try:
-                s3 = _s3_clients()
-                bucket = _video_s3_bucket()
-                key = _video_s3_key(video_filename)
-                obj = s3.get_object(Bucket=bucket, Key=key)
-                data_bytes = obj["Body"].read()
-                with open(video_path, "wb") as f:
-                    f.write(data_bytes)
-            except (BotoCoreError, ClientError, Exception) as e:
-                print(f"S3 download error for {video_filename}: {e}")
-                return jsonify({"error": "Video file not found"}), 404
-        else:
-            if not os.path.exists(video_path):
-                print(f"Error: Video file {video_filename} not found at {video_path}")
-                return jsonify({"error": "Video file not found"}), 404
-        print(f"Processing video: {video_filename}")
-        # Convert video to audio
-        audio_filename = f"{uuid.uuid4()}.mp3"
-        audio_path = os.path.join(AUDIO_FOLDER, audio_filename)
-        if not convert_video_to_audio(video_path, audio_path):
-            print("Error: Video to audio conversion failed.")
-            return jsonify({"error": "Failed to convert video to audio"}), 500
-        # Transcribe audio using Google Cloud Speech-to-Text
-        with open(audio_path, 'rb') as audio_file:
-            audio_content = audio_file.read()
-        audio = speech.RecognitionAudio(content=audio_content)
-        config = speech.RecognitionConfig(
-            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
-            sample_rate_hertz=16000,
-            language_code="en-US",
-        )
-        response = speech_client.recognize(config=config, audio=audio)
-        transcripts = [result.alternatives[0].transcript for result in response.results]
-        if not transcripts:
-            print("Error: No transcription results found.")
-            return jsonify({"error": "No transcription results found"}), 500
-        transcription_text = " ".join(transcripts)
-        print(f"Transcription successful: {transcription_text[:200]}...")  # Print first 200 chars
-        # ---------------- Cohere v2 Chat call (minimal change) ----------------
-        headers = {
-            "Authorization": f"Bearer {COHERE_API_KEY}",
-            "Content-Type": "application/json"
-        }
-        prompt_text = (
-            "Generate exactly three multiple-choice questions based on this text:\n"
-            f"{transcription_text}\n\n"
-            "Rules:\n"
-            "- Each question starts with a number and a period (e.g., 1.)\n"
-            "- Each question has exactly four options labeled A., B., C., and D.\n"
-            "- After the options, add a line 'Correct answer: <A|B|C|D>'\n"
-            "- Output plain text only."
-        )
-        cohere_payload = {
-            "model": "command-r-08-2024",
-            "messages": [
-                {"role": "user", "content": prompt_text}
-            ],
-            "max_tokens": 300,
-            "temperature": 0.9
-        }
-        cohere_response = requests.post(
-            COHERE_API_URL,
-            json=cohere_payload,
-            headers=headers,
-            timeout=60
-        )
-        if cohere_response.status_code != 200:
-            print(f"Error: Cohere API response failed: {cohere_response.text}")
-            return jsonify({"error": "Failed to generate questions"}), 500
-        raw_text = _extract_text_v2(cohere_response.json())
-        if not raw_text:
-            print("Error: No questions text returned by Cohere Chat API.")
-            return jsonify({"error": "No questions generated"}), 500
-        # ---------------------------------------------------------------------
-        # Extract raw text and parse questions
-        structured_questions = parse_questions(raw_text)
-        return jsonify({"questions": structured_questions}), 200
-    except Exception as e:
-        print(f"Critical Error: {e}")
-        return jsonify({"error": "An error occurred while generating questions"}), 500
-def parse_questions(response_text):
-    # Split the text into individual question blocks
-    question_blocks = response_text.split("\n\n")
-    questions = []
-    # Process each question block
-    for block in question_blocks:
-        print("\nProcessing Block:", block)  # Debug: Log each question block
-        # Split the block into lines
-        lines = block.strip().split("\n")
-        print("Split Lines:", lines)  # Debug: Log split lines of the block
-        # Ensure the block contains a question
-        if len(lines) < 2:
-            print("Skipping Invalid Block")  # Debug: Log invalid blocks
-            continue
-        # Extract the question text
-        question_line = lines[0]
-        question_text = question_line.split(". ", 1)[1] if ". " in question_line else question_line
-        print("Question Text:", question_text)  # Debug: Log extracted question text
-        # Extract the options and find the correct answer
-        options = []
-        correct_answer_letter = None
-        for line in lines[1:]:
-            line = line.strip()
-            # Handle A., B., C., D. and also a) / A) formats
-            if line.lower().startswith("correct answer:"):
-                correct_answer_letter = line.split(":")[-1].strip()
-                continue
-            match = re.match(r"^(?:[a-dA-D][\).]?\s)?(.+)$", line)
-            if match:
-                option_text = match.group(1).strip()
-                # We already handled "Correct answer:" above, so only options get appended
-                if not line.lower().startswith("correct answer:"):
-                    options.append(option_text)
-        print("Extracted Options:", options)  # Debug: Log extracted options
-        print("Correct Answer Letter:", correct_answer_letter)  # Debug: Log the correct answer letter
-        # Map the correct answer text
-        correct_answer_text = ""
-        if correct_answer_letter:
-            option_index = ord(correct_answer_letter.upper()) - ord('A')  # Convert 'A'→0, 'B'→1, etc.
-            if 0 <= option_index < len(options):
-                correct_answer_text = options[option_index]
-        print("Mapped Correct Answer Text:", correct_answer_text)  # Debug: Log mapped answer
-        # Append the parsed question to the list
-        if question_text and options:
-            questions.append({
-                "question": question_text,
-                "options": options,
-                "answer": correct_answer_text  # Use the full answer text
-            })
-    print("\nFinal Questions:", questions)  # Debug: Log final parsed questions
-    return questions
-# ---------- Standalone (local testing) ----------
-if __name__ == '__main__':
-    app = Flask(__name__)
-    CORS(app)
-    app.config["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY", COHERE_API_KEY)
-    app.register_blueprint(listen_bp, url_prefix='')
-    app.run(host='0.0.0.0', port=5012, debug=True)

media/audio/explain_1112505a6701429cb241d131a88bf709.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0f7aae706a5bc1c25e9cf61ddc970ab3d0454650c14a936c7da051556c057091
-size 1951916

media/audio/explain_5c2a7427d1f14a2aa9fa9e59bb1ad603.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af65a4273dbbcb6aa004215de1f58b1fd964bcdb1df04ce10a0a872a920b29c5
-size 634956

media/audio/explain_975ae1b5996743f6b76b5016f17056de.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7c54cd66b876ad6ab7ad9b7420fd38e2aa80e625e5d34be2fb3ef9d96461ddff
-size 503372

media/audio/explain_ca92720c882d4926973973aa4b9f2316.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:14a428d11bba10b2f51c72826a9339dde62189153473796269b0fd7a09f27c54
-size 193612

media/audio/explain_cc24a21b0b374e50bc8afbf73a7398c4.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a1dff985cf893840190f1f1e8229e99c80ed5651035f820c1590e39690fc009f
-size 175692

media/audio/explain_dd70fb52325d44fc84cde7c1c9215232.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e37bd2a66186f0e2ca5ee4a02b0a9b63977af13f9bf5bb32f006f8a2066edcf7
-size 470092

media/audio/synth_22ebf1e3b9404b34a41b2fdc2c691adb.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7137826339f483af77865b9dfd96c2386311e5cfcc52ca5990a011fabdd12fab
-size 1287340

media/audio/synth_2757240115da4ba3a9aa1286aee57db9.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:675814fca44682416fa92128edc0b7637c3afdeeea7043f7a167ce36f4ac4a01
-size 676972

media/audio/synth_4965badeb7da43ffac0c3a7af781ab0f.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bab5cb8d0e45d587f484120aaad8eefb100a757ab3c91b8909afedbc199ce106
-size 157772

media/audio/synth_7bccf943f0b24880b77aa038b38f8bf1.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2914d7c32525eaf3ab30bedbca0a1dc3a9d1145dad34a0c45eb13f28d67f3d7e
-size 465484

chroma_db/1ceaf3a3-30e6-42c4-b515-99a05466da04/data_level0.bin → media/audio/synth_d38b265fcd6d4f9cbb825007c3f52ac5.wav RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90b564d60a2658c07a41e1133109c1574bb40f6ab674750bba8b8eeb28a08f25
-size 167600

 version https://git-lfs.github.com/spec/v1
+oid sha256:fe2266670758acffea3818641f235780f4609418c58e5b6065ae48a22d02a870
+size 483404

media/audio/synth_ee1e3e992d6641b9a06d214e0e67ea92.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8747b294606c0766d55d6d24adc2c5ace29259c1f1969bae781f92e25dfb456f
-size 505932

pdfs/testing.pdf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b85c06e93333ac99d33ffb8b4f9a4d8402c26ce5b323398bb6691b2f58acee64
-size 7352882

pron.py DELETED Viewed

@@ -1,729 +0,0 @@
-"""
-Pronunciation Trainer – Final Version
-Real IPA • Whisper small.en • Phoneme Substitution Detection
-Dynamic Feedback System for Children & Adults
-"""
-import os
-import io
-import re
-import uuid
-import tempfile
-import numpy as np
-import librosa
-from flask import Blueprint, request, jsonify, send_file
-from difflib import SequenceMatcher
-from werkzeug.utils import secure_filename
-from pydub import AudioSegment
-from pathlib import Path
-# -------------------------------------------------------------------------
-# IMPORTANT: Patch torch.load so XTTS can load on PyTorch 2.6 (HF Space)
-# -------------------------------------------------------------------------
-import torch
-_original_torch_load = torch.load
-def _torch_load_allow_weights(*args, **kwargs):
-    """
-    Replacement for torch.load that pins weights_only to False.
-    Whatever weights_only value the caller supplied is discarded before the call
-    is forwarded to the original torch.load. This turns off the restricted
-    unpickling mode, so it is only acceptable for checkpoints that are trusted.
-    """
-    forwarded = dict(kwargs)
-    forwarded["weights_only"] = False
-    return _original_torch_load(*args, **forwarded)
-torch.load = _torch_load_allow_weights
-print(">>> [PRON] Patched torch.load to use weights_only=False for XTTS.", flush=True)
-# Use the same XTTS helper that already works in ragg
-from ragg.tts import xtts_speak_to_file
-# -------------------------------------------------------------------------
-# OPTIONAL MODULES
-# -------------------------------------------------------------------------
-try:
-    import whisper
-    WHISPER_AVAILABLE = True
-    # Filled in by get_whisper(); stays None until the first call
-    WHISPER_MODEL = None
-    def get_whisper():
-        """Return the shared Whisper model, loading "small.en" on the first call."""
-        global WHISPER_MODEL
-        if WHISPER_MODEL is None:
-            # Use small.en as requested
-            WHISPER_MODEL = whisper.load_model("small.en")
-        return WHISPER_MODEL
-except Exception:
-    # Any import failure disables ASR; get_whisper is then undefined, so callers
-    # must check WHISPER_AVAILABLE before using it
-    WHISPER_AVAILABLE = False
-try:
-    from phonemizer import phonemize
-    PHONEMIZER_AVAILABLE = True
-except Exception:
-    # Without phonemizer, ipa_phonemes() returns its input text unchanged
-    PHONEMIZER_AVAILABLE = False
-# -------------------------------------------------------------------------
-# PATHS
-# -------------------------------------------------------------------------
-# Directory that contains this module
-BASE = os.path.dirname(os.path.abspath(__file__))
-STATIC_DIR = os.path.join(BASE, "static")
-AUDIO_DIR = os.path.join(STATIC_DIR, "audio")  # generated teacher audio is written here
-REF_DIR = os.path.join(STATIC_DIR, "references")  # uploaded reference voices are saved here
-# Create both folders at import time so later saves do not fail on a missing directory
-os.makedirs(AUDIO_DIR, exist_ok=True)
-os.makedirs(REF_DIR, exist_ok=True)
-# Use the same base/trim logic as in ragg/tts.py
-# NOTE(review): BASE_DIR is the parent of this module's directory, unlike BASE above - confirm this is intended
-BASE_DIR = Path(__file__).resolve().parent.parent
-# The XTTS_REF_DIR environment variable overrides the default "<BASE_DIR>/trim" folder
-XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim")))
-# Optional local default reference under this blueprint
-DEFAULT_REFERENCE = Path(REF_DIR) / "voice1.wav"
-# Blueprint that the pronunciation routes in this module are registered on
-pron_bp = Blueprint("pron", __name__)
-# -------------------------------------------------------------------------
-# HELPERS
-# -------------------------------------------------------------------------
-def normalize(text):
-    """Lower-case text and keep only the letters a-z and spaces, trimmed at both ends."""
-    if text:
-        lowered = text.lower().strip()
-        return re.sub(r"[^a-z ]", "", lowered).strip()
-    return ""
-def read_numpy(file, sr=16000):
-    """
-    Decode an uploaded audio file into a mono float32 NumPy array.
-    Parameters:
-      file: upload object exposing .stream and .filename.
-      sr:   target sample rate in Hz; the decoded audio is converted to this rate.
-    Returns:
-      (samples, sr), where samples are divided by the full-scale value of the
-      decoded sample width.
-    """
-    file.stream.seek(0)
-    raw = file.stream.read()
-    b = io.BytesIO(raw)
-    # filename can be None or empty for some uploads; fall back to "wav" as the format hint
-    ext = os.path.splitext(file.filename or "")[1].replace(".", "") or "wav"
-    try:
-        audio = AudioSegment.from_file(b, format=ext)
-    except Exception:
-        # The extension may be wrong or unknown to the decoder; retry with auto-detection
-        b.seek(0)
-        audio = AudioSegment.from_file(b)
-    audio = audio.set_channels(1).set_frame_rate(sr)
-    arr = np.array(audio.get_array_of_samples(), dtype=np.float32)
-    # 2 ** (bits - 1) is the magnitude of the most negative signed sample
-    max_val = float(1 << (audio.sample_width * 8 - 1))
-    return arr / max_val, sr
-def detect_silence(y, sr):
-    """
-    Decide whether a recording is unusable.
-    Returns (True, reason) with reason one of "no_audio", "too_short" (under 0.3 s)
-    or "too_quiet" (peak absolute amplitude under 0.015); otherwise (False, None).
-    """
-    if y is None or len(y) == 0:
-        return True, "no_audio"
-    seconds = len(y) / sr
-    peak = np.max(np.abs(y))
-    reason = None
-    # Length is checked before loudness, so a short quiet clip reports "too_short"
-    if seconds < 0.3:
-        reason = "too_short"
-    elif peak < 0.015:
-        reason = "too_quiet"
-    return (reason is not None), reason
-def _make_suggestion_payload(message):
-    """
-    Wrap a message in the one-item list-of-dicts shape used for the
-    "suggestion" and "feedback" response fields.
-    """
-    notice = {"title": "Notice", "message": message}
-    return [notice]
-def error_response(error_key, message, status=400, extra=None):
-    """
-    Build a JSON error reply and return it together with its HTTP status.
-    The body carries the error key, the message, and the message repeated in the
-    "suggestion" and "feedback" fields. Entries from extra, when given, are merged
-    last and therefore override the defaults.
-    """
-    body = dict(
-        error=error_key,
-        message=message,
-        suggestion=_make_suggestion_payload(message),
-        feedback=_make_suggestion_payload(message),
-    )
-    if extra:
-        body.update(extra)
-    return jsonify(body), status
-def structured_feedback_error(error_key, message, extra=None, status=200):
-    """
-    Build a JSON reply that has every field of a successful pronunciation result,
-    filled with neutral values, plus the error key and message.
-    Descriptive fields default to None, score fields to 0.0, and "silent" to False.
-    Entries from extra, when given, are merged last and override the defaults.
-    Returns the (response, status) pair; status defaults to 200.
-    """
-    body = {"error": error_key, "message": message, "silent": False}
-    body.update(dict.fromkeys(("word", "heard_word", "phoneme_teacher", "phoneme_student")))
-    body.update(dict.fromkeys(
-        ("phoneme_similarity", "phonemeSimilarity", "phoneme_score", "phonemeScore"),
-        0.0,
-    ))
-    body["feedback"] = _make_suggestion_payload(message)
-    body["suggestion"] = _make_suggestion_payload(message)
-    body["audio_url"] = None
-    if extra:
-        body.update(extra)
-    return jsonify(body), status
-# -------------------------------------------------------------------------
-# REAL IPA PHONEMES
-# -------------------------------------------------------------------------
-def ipa_phonemes(text):
-    """
-    Convert text to a space-normalised IPA string using phonemizer's espeak backend.
-    Stress marks are kept and each one is preceded by a space, so a stressed
-    syllable starts a new whitespace-separated token. When phonemizer is not
-    installed, or the conversion raises, the input text is returned unchanged.
-    """
-    if not text:
-        return ""
-    if PHONEMIZER_AVAILABLE:
-        try:
-            # phonemize() accepts no "ipa" keyword: passing one raised TypeError on
-            # every call, the handler below swallowed it, and IPA was never produced.
-            ipa = phonemize(
-                text,
-                language="en-us",
-                backend="espeak",
-                strip=True,
-                preserve_punctuation=False,
-                with_stress=True,
-            )
-            ipa = ipa.replace("ˈ", " ˈ").replace("ˌ", " ˌ")
-            return " ".join(ipa.split())
-        except Exception:
-            # Best effort: the backend can still fail at run time, so fall back to the raw text
-            return text
-    return text
-# -------------------------------------------------------------------------
-# ASR OVERRIDE FOR SHORT WORDS
-# -------------------------------------------------------------------------
-def strong_word_match(word, heard, teacher_ph, student_ph):
-    """
-    Decide whether the recognised word is close enough to the target word.
-    Accepts when any one of these holds:
-      * the two phoneme strings have a similarity ratio of at least 0.80;
-      * both phoneme strings are non-empty and share the same first token;
-      * the target has at most 5 letters and the spelling ratio is at least 0.60.
-    """
-    word_ratio = SequenceMatcher(None, heard, word).ratio()
-    phoneme_ratio = SequenceMatcher(None, teacher_ph, student_ph).ratio()
-    teacher_tokens = teacher_ph.split()
-    student_tokens = student_ph.split()
-    same_onset = (
-        bool(teacher_tokens)
-        and bool(student_tokens)
-        and teacher_tokens[0] == student_tokens[0]
-    )
-    short_and_close = len(word) <= 5 and word_ratio >= 0.60
-    return phoneme_ratio >= 0.80 or same_onset or short_and_close
-# -------------------------------------------------------------------------
-# TTS (Teacher Voice) – using shared xtts_speak_to_file
-# -------------------------------------------------------------------------
-def clone_voice(text, out_path, reference: Path | str | None = None):
-    """
-    Synthesise 'text' into out_path with XTTS and return whatever
-    xtts_speak_to_file returns.
-    The reference voice is chosen in this order:
-      1) the caller's reference, when it points at an existing file;
-      2) DEFAULT_REFERENCE, when that file exists;
-      3) the XTTS_REF_DIR folder.
-    """
-    candidate = Path(str(reference)) if reference is not None else None
-    if candidate is not None and candidate.is_file():
-        voice_source = {"reference_files": [candidate]}
-    elif DEFAULT_REFERENCE.is_file():
-        voice_source = {"reference_files": [DEFAULT_REFERENCE]}
-    else:
-        voice_source = {"reference_dir": XTTS_REF_DIR}
-    return xtts_speak_to_file(text=text, out_file=out_path, language="en", **voice_source)
-def clone_voice_bytes(text, reference: Path | str | None = None):
-    """
-    Generate teacher audio for 'text' and return the WAV file content as bytes.
-    The audio is synthesised into a temporary file that is removed before
-    returning, whether or not synthesis succeeded. Exceptions raised by
-    clone_voice propagate to the caller.
-    """
-    # Only the path is needed: close the handle at once so the descriptor is not
-    # left open and the synthesiser can reopen the file by name.
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
-        tmp_path = Path(handle.name)
-    try:
-        clone_voice(text, tmp_path, reference=reference)
-        with open(tmp_path, "rb") as f:
-            data = f.read()
-    finally:
-        try:
-            tmp_path.unlink()
-        except OSError:
-            # Cleanup is best effort; never mask the real error with a delete failure
-            pass
-    return data
-# -------------------------------------------------------------------------
-# WAVEFORM / SPECTROGRAM HELPERS
-# -------------------------------------------------------------------------
-def load_audio_from_bytes(data_bytes: bytes, sr=16000):
-    """
-    Decode in-memory audio bytes with librosa and return (samples, sample_rate).
-    The bytes are written to a temporary .wav file because the loader is given a
-    path; the file is removed afterwards even if writing or decoding fails.
-    Audio is loaded as mono at the requested rate sr.
-    """
-    scratch = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    scratch_path = scratch.name
-    try:
-        # Leaving the with-block flushes and closes the handle before librosa reads the file
-        with scratch:
-            scratch.write(data_bytes)
-        samples, rate = librosa.load(scratch_path, sr=sr, mono=True)
-    finally:
-        try:
-            os.remove(scratch_path)
-        except Exception:
-            pass
-    return samples, rate
-def compute_waveform_similarity(y_ref, y_stud, sr=16000):
-    """
-    Compare a reference and a student recording and return a similarity report.
-    Both signals are silence-trimmed, then scored two ways:
-      * DTW cost between 13-coefficient MFCC sequences, mapped to 0-100 as
-        max(0, 100 - 30 * cost / (frames_ref + frames_stud));
-      * Pearson correlation of the overlapping raw samples, mapped from [-1, 1] to 0-100.
-    Returns a dict with the keys similarity (0.65 * DTW score + 0.35 * correlation
-    score, rounded to 2 decimals), dtw_dist, dtw_norm, dtw_sim, corr and corr_sim.
-    A component that fails contributes 0 instead of raising. If either trimmed
-    signal has fewer than 10 samples the all-default report is returned.
-    """
-    result = {
-        "similarity": 0.0,
-        "dtw_dist": None,
-        "dtw_norm": None,
-        "dtw_sim": None,
-        "corr": None,
-        "corr_sim": None,
-    }
-    try:
-        y_ref_trim, _ = librosa.effects.trim(y_ref, top_db=20)
-    except Exception:
-        y_ref_trim = y_ref
-    try:
-        y_stud_trim, _ = librosa.effects.trim(y_stud, top_db=20)
-    except Exception:
-        y_stud_trim = y_stud
-    if y_ref_trim is None or y_stud_trim is None or len(y_ref_trim) < 10 or len(y_stud_trim) < 10:
-        return result
-    try:
-        # The signal must be passed as y=...: newer librosa releases accept it only
-        # as a keyword, and a positional call raised here and zeroed the DTW score.
-        mfcc_ref = librosa.feature.mfcc(y=y_ref_trim, sr=sr, n_mfcc=13)
-        mfcc_stud = librosa.feature.mfcc(y=y_stud_trim, sr=sr, n_mfcc=13)
-        D, wp = librosa.sequence.dtw(X=mfcc_ref, Y=mfcc_stud, metric="euclidean")
-        dtw_dist = float(D[-1, -1])
-        # Normalise by the total number of frames so longer words are not penalised
-        denom = (mfcc_ref.shape[1] + mfcc_stud.shape[1]) if (mfcc_ref.shape[1] + mfcc_stud.shape[1]) > 0 else 1.0
-        dtw_norm = dtw_dist / denom
-        dtw_sim = max(0.0, 100.0 - dtw_norm * 30.0)
-        result["dtw_dist"] = dtw_dist
-        result["dtw_norm"] = dtw_norm
-        result["dtw_sim"] = max(0.0, min(100.0, dtw_sim))
-    except Exception:
-        result["dtw_dist"] = None
-        result["dtw_norm"] = None
-        result["dtw_sim"] = 0.0
-    try:
-        min_len = min(len(y_ref_trim), len(y_stud_trim))
-        if min_len <= 1:
-            corr = 0.0
-        else:
-            r = y_ref_trim[:min_len]
-            s = y_stud_trim[:min_len]
-            # The epsilon keeps a constant signal from dividing by zero
-            r = (r - np.mean(r)) / (np.std(r) + 1e-9)
-            s = (s - np.mean(s)) / (np.std(s) + 1e-9)
-            corr = float(np.corrcoef(r, s)[0, 1])
-            if np.isnan(corr):
-                corr = 0.0
-        corr_sim = ((corr + 1.0) / 2.0) * 100.0
-        result["corr"] = corr
-        result["corr_sim"] = max(0.0, min(100.0, corr_sim))
-    except Exception:
-        result["corr"] = None
-        result["corr_sim"] = 0.0
-    dtw_component = float(result["dtw_sim"] or 0.0)
-    corr_component = float(result["corr_sim"] or 0.0)
-    combined = 0.65 * dtw_component + 0.35 * corr_component
-    result["similarity"] = round(float(max(0.0, min(100.0, combined))), 2)
-    return result
-def build_waveform_feedback(word: str, sim_dict: dict, threshold: float):
-    """
-    Turn a waveform similarity report into an ordered list of feedback cards.
-    Reads "similarity", "dtw_sim" and "corr_sim" from sim_dict (missing or falsy
-    values count as 0.0) and returns five {"title", "message"} dicts in this
-    order: Overall Pronunciation, Rhythm and Timing, Clarity of Sound,
-    Practice Tip, Score. The Score card states whether similarity reached threshold.
-    """
-    def grade(value, tiers, fallback):
-        # tiers run from the highest cut-off down; the first one reached wins
-        return next((text for floor, text in tiers if value >= floor), fallback)
-    def card(title, message):
-        return {"title": title, "message": message}
-    score = float(sim_dict.get("similarity") or 0.0)
-    dtw_sim = float(sim_dict.get("dtw_sim") or 0.0)
-    corr_sim = float(sim_dict.get("corr_sim") or 0.0)
-    overall = grade(
-        score,
-        [
-            (90, f"Excellent. Your waveform for '{word}' is almost the same as the teacher."),
-            (75, f"Very good. Your pronunciation of '{word}' is close to the teacher. Small improvements are possible."),
-            (60, f"Good attempt. You are understandable, but you can still improve clarity and smoothness for '{word}'."),
-        ],
-        f"You are trying well, but the sound of '{word}' is still far from the teacher. Please practise a few more times.",
-    )
-    rhythm = grade(
-        dtw_sim,
-        [
-            (75, "Your timing and rhythm are close to the teacher. You are stressing the word in a similar way."),
-            (55, "Your timing is acceptable, but you can make the word smoother. Try saying the word in one smooth breath."),
-        ],
-        "Your timing is quite different. Try to copy when the teacher starts and stops the word and keep a steady pace.",
-    )
-    clarity = grade(
-        corr_sim,
-        [
-            (75, "Your sound shape is clear and close to the teacher. Mouth and tongue positions are mostly correct."),
-            (55, "Your sound is partly clear. Try opening your mouth a little more and speak a bit more clearly."),
-        ],
-        "The sound shape is quite different. Try to listen carefully and slowly copy the teacher sound.",
-    )
-    if score >= threshold:
-        passed_text = "You passed the target for this word."
-    else:
-        passed_text = "You did not yet pass the target. Try again."
-    return [
-        card("Overall Pronunciation", overall),
-        card("Rhythm and Timing", rhythm),
-        card("Clarity of Sound", clarity),
-        card(
-            "Practice Tip",
-            "Listen to the teacher audio 2–3 times and then repeat slowly. Focus on copying the length and loudness of the sound.",
-        ),
-        card("Score", f"Waveform score: {score:.1f}/100. Target: {threshold:.1f}. {passed_text}"),
-    ]
-# -------------------------------------------------------------------------
-# ROUTE: Generate Teacher Audio (download)
-# -------------------------------------------------------------------------
-@pron_bp.route("/generate_teacher_audio", methods=["POST"])
-def generate_teacher_audio():
-    """
-    Synthesise teacher audio for the posted word and return its URL.
-    Form fields: "word" (required) and an optional "reference" voice file.
-    On success returns JSON {"url": <path relative to the static folder>}.
-    Errors: 400 when the word is missing, 500 when the reference cannot be found
-    or synthesis fails, 503 when the TTS engine raises RuntimeError.
-    """
-    word = request.form.get("word", "").strip().lower()
-    if not word:
-        return error_response("word_required", "Word required", 400)
-    ref = None
-    if "reference" in request.files:
-        rf = request.files["reference"]
-        fname = secure_filename(rf.filename or "")
-        # secure_filename returns "" for names made only of unsafe characters. Saving
-        # to REF_DIR itself would fail, so treat that case as "no reference supplied".
-        if fname:
-            path = os.path.join(REF_DIR, fname)
-            rf.save(path)
-            ref = path
-    # The word comes from the client: strip path separators and similar characters
-    # before using it in a file name so the output cannot land outside AUDIO_DIR.
-    safe_stem = secure_filename(word) or "word"
-    out = os.path.join(AUDIO_DIR, f"{safe_stem}-{uuid.uuid4().hex}.wav")
-    try:
-        clone_voice(word, out, reference=ref)
-    except FileNotFoundError as e:
-        return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
-    except RuntimeError as e:
-        return error_response("tts_unavailable", f"TTS unavailable: {e}", 503)
-    except Exception as e:
-        return error_response("tts_generation_failed", f"TTS generation failed: {e}", 500)
-    # Always hand the client forward slashes, also on Windows hosts
-    rel = os.path.relpath(out, STATIC_DIR).replace("\\", "/")
-    return jsonify({"url": rel})
-# -------------------------------------------------------------------------
-# ROUTE: Teacher Audio Stream
-# -------------------------------------------------------------------------
-@pron_bp.route("/generate_teacher_audio_stream", methods=["POST"])
-def generate_teacher_audio_stream():
-    """
-    Synthesise teacher audio for the posted word and stream it back as audio/wav.
-    Form fields: "word" (required) and an optional "reference" voice file, which
-    is saved under REF_DIR before synthesis.
-    Errors: 400 when the word is missing; 500 when the reference cannot be saved
-    or found, or synthesis fails. A RuntimeError from the TTS engine produces a
-    structured "tts_unavailable" payload with status 200.
-    """
-    word = request.form.get("word", "").strip().lower()
-    if not word:
-        return error_response("word_required", "Word required", 400)
-    saved_reference = None
-    if "reference" in request.files:
-        try:
-            upload = request.files["reference"]
-            saved_reference = os.path.join(REF_DIR, secure_filename(upload.filename))
-            upload.save(saved_reference)
-        except Exception as e:
-            app_msg = f"reference save failed: {e}"
-            print(app_msg)
-            return error_response("reference_save_failed", app_msg, 500)
-    try:
-        wav_stream = io.BytesIO(clone_voice_bytes(word, reference=saved_reference))
-        return send_file(wav_stream, mimetype="audio/wav", as_attachment=False)
-    except FileNotFoundError as e:
-        print("generate_teacher_audio_stream FileNotFoundError:", e)
-        return error_response("reference_not_found", f"Reference audio not found: {e}", 500)
-    except RuntimeError as e:
-        print("generate_teacher_audio_stream RuntimeError (XTTS):", e)
-        unavailable = (
-            "Teacher voice model is not available on this server. "
-            "You can still practise pronunciation, but teacher audio cannot be generated."
-        )
-        return structured_feedback_error("tts_unavailable", unavailable, status=200)
-    except Exception as exc:
-        print("generate_teacher_audio_stream error:", exc)
-        return error_response("tts_generation_failed", f"TTS generation failed: {exc}", 500)
-# -------------------------------------------------------------------------
-# ROUTE: PRONUNCIATION CHECK
-# -------------------------------------------------------------------------
-@pron_bp.route("/check_pronunciation", methods=["POST"])
-def check_pronunciation():
-    """
-    Score a student's recording of a single word.
-    Form fields: "audio" (required file), "word" (required), "mode" ("phonetics" by
-    default, or "waveform"), and for waveform mode an optional "reference" file and
-    an optional numeric "threshold" (default 65.0).
-    Waveform mode compares the recording against teacher audio and returns the
-    similarity report. Otherwise the recording is transcribed with Whisper, the
-    heard word is compared with the target through their phoneme strings, and a
-    list of feedback cards is returned.
-    Errors: 400 for missing or unusable input, 500 when teacher audio cannot be
-    produced or loaded. Recognition problems (nothing heard, several words, wrong
-    word) are returned as structured payloads with status 200.
-    """
-    if "audio" not in request.files:
-        return error_response("audio_required", "Audio required. Please record and try again.", 400)
-    word = request.form.get("word", "").strip().lower()
-    if not word:
-        return error_response("word_required", "Word required", 400)
-    mode = request.form.get("mode", "phonetics")
-    file = request.files["audio"]
-    try:
-        y_student, sr = read_numpy(file)
-    except Exception as e:
-        # An undecodable upload is a client problem, not a server crash
-        return error_response("audio_decode_failed", f"Could not decode the uploaded audio: {e}", 400)
-    silent, reason = detect_silence(y_student, sr)
-    if silent:
-        if reason == "too_short":
-            msg = "Recording was too short. Please speak clearly for at least 0.3 seconds."
-        elif reason == "too_quiet":
-            msg = "Recording too quiet. Increase microphone volume or speak louder."
-        else:
-            msg = "No audio detected. Please record again."
-        return jsonify({
-            "silent": True,
-            "reason": reason,
-            "suggestion": _make_suggestion_payload(msg),
-            "feedback": _make_suggestion_payload(msg),
-            "message": msg,
-        })
-    if mode == "waveform":
-        # Validate the threshold first so a bad value fails before any synthesis work
-        try:
-            threshold = float(request.form.get("threshold", 65.0))
-        except (TypeError, ValueError):
-            return error_response("invalid_threshold", "Threshold must be a number.", 400)
-        teacher_bytes = None
-        if "reference" in request.files:
-            try:
-                rf = request.files["reference"]
-                teacher_bytes = rf.read()
-            except Exception:
-                teacher_bytes = None
-        if teacher_bytes is None:
-            try:
-                teacher_bytes = clone_voice_bytes(word, reference=None)
-            except Exception:
-                teacher_bytes = None
-        if teacher_bytes is None:
-            return error_response("teacher_audio_unavailable", "Teacher audio not available", 500)
-        try:
-            y_teacher, sr_teacher = load_audio_from_bytes(teacher_bytes, sr=sr)
-        except Exception as e:
-            return error_response("teacher_load_failed", f"Failed to load teacher audio: {e}", 500)
-        sim = compute_waveform_similarity(y_teacher, y_student, sr=sr)
-        matched = (sim.get("similarity", 0.0) >= threshold)
-        feedback = build_waveform_feedback(word, sim, threshold)
-        return jsonify({
-            "mode": "waveform",
-            "silent": False,
-            "word": word,
-            "waveform_similarity": float(sim.get("similarity") or 0.0),
-            "waveformScore": float(sim.get("similarity") or 0.0),
-            "waveform_match": bool(matched),
-            "feedback": feedback,
-            "suggestion": feedback,
-            "details": {
-                "dtw_dist": sim.get("dtw_dist"),
-                "dtw_norm": sim.get("dtw_norm"),
-                "dtw_sim": sim.get("dtw_sim"),
-                "corr": sim.get("corr"),
-                "corr_sim": sim.get("corr_sim"),
-            },
-        })
-    heard = ""
-    if WHISPER_AVAILABLE:
-        # Only the path is needed; close the handle so the file can be reopened by name
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
-            tmp = handle.name
-        try:
-            file.stream.seek(0)
-            with open(tmp, "wb") as f:
-                f.write(file.read())
-            result = get_whisper().transcribe(tmp, language="en")
-        finally:
-            # Remove the scratch file even when transcription raises
-            try:
-                os.remove(tmp)
-            except OSError:
-                pass
-        heard = normalize(result.get("text", ""))
-    if not heard:
-        return structured_feedback_error("no_asr", "Could not understand speech. Please try again.")
-    parts = heard.split()
-    if len(parts) > 1:
-        msg = f"Detected multiple words: '{heard}'. Please say only '{word}'."
-        return structured_feedback_error(
-            "multiple_words",
-            msg,
-            extra={"word": word, "heard_word": heard},
-        )
-    heard_word = parts[0]
-    teacher_ph = ipa_phonemes(word)
-    student_ph = ipa_phonemes(heard_word)
-    if not strong_word_match(word, heard_word, teacher_ph, student_ph):
-        msg = f"You said '{heard_word}'. Please say only '{word}'."
-        return structured_feedback_error(
-            "incorrect_word",
-            msg,
-            extra={"word": word, "heard_word": heard_word},
-        )
-    feedback = []
-    t_tokens = teacher_ph.split()
-    s_tokens = student_ph.split()
-    # Token-level diff of the two phoneme strings: one card per differing run
-    sm = SequenceMatcher(None, t_tokens, s_tokens)
-    for tag, i1, i2, j1, j2 in sm.get_opcodes():
-        if tag == "delete":
-            missing = t_tokens[i1:i2]
-            feedback.append({
-                "title": "Missing Sounds",
-                "message": f"You missed these sounds: {' '.join(missing)}. Try to say each sound clearly."
-            })
-        elif tag == "insert":
-            extra = s_tokens[j1:j2]
-            feedback.append({
-                "title": "Extra Sounds",
-                "message": f"You added extra sounds: {' '.join(extra)}. Try to keep only the sounds from the teacher word."
-            })
-        elif tag == "replace":
-            exp = t_tokens[i1:i2]
-            rec = s_tokens[j1:j2]
-            feedback.append({
-                "title": "Sound Substitution",
-                "message": f"Expected {' '.join(exp)} but you said {' '.join(rec)}. Listen again and copy the teacher sound."
-            })
-    vowels = "æɪiːʌəɑɒɔːeɜːuːʊɛ"
-    # The vowel comparison works character by character over the whole phoneme string
-    v_t = [p for p in teacher_ph if p in vowels]
-    v_s = [p for p in student_ph if p in vowels]
-    if v_t != v_s:
-        feedback.append({
-            "title": "Vowel Accuracy",
-            "message": "Your vowel sound is different. Open your mouth and copy the long or short sound of the teacher."
-        })
-    else:
-        feedback.append({
-            "title": "Vowel Accuracy",
-            "message": "Your vowel pronunciation is accurate and matches the teacher."
-        })
-    # The consonant comparison works per token, keyed on each token's first character
-    cons_t = [p for p in t_tokens if p and p[0] not in vowels]
-    cons_s = [p for p in s_tokens if p and p[0] not in vowels]
-    if cons_t != cons_s:
-        feedback.append({
-            "title": "Consonant Accuracy",
-            "message": "Some consonant sounds are different. Focus on the first and last sound of the word."
-        })
-    else:
-        feedback.append({
-            "title": "Consonant Accuracy",
-            "message": "Your consonant sounds match well with the teacher."
-        })
-    ph_sim = SequenceMatcher(None, teacher_ph, student_ph).ratio()
-    score = round(ph_sim * 100, 2)
-    if score >= 90:
-        overall_msg = f"Excellent. Your pronunciation of '{word}' is almost perfect."
-    elif score >= 75:
-        overall_msg = f"Very good. Your pronunciation of '{word}' is clear with small differences."
-    elif score >= 60:
-        overall_msg = f"Good attempt. People can understand '{word}', but you can improve some sounds."
-    else:
-        overall_msg = f"You are trying well, but you need more practice to say '{word}' like the teacher."
-    # The overall card goes first so the client can show it as the headline
-    feedback.insert(0, {
-        "title": "Overall Score",
-        "message": f"Phoneme score: {score:.1f}/100. {overall_msg}"
-    })
-    feedback.append({
-        "title": "How To Say It",
-        "message": f"Correct IPA for '{word}': {teacher_ph}"
-    })
-    feedback.append({
-        "title": "Practice Tip",
-        "message": "Listen to the teacher voice, then repeat slowly 3 times. Focus on the first sound and the vowel in the middle."
-    })
-    return jsonify({
-        "silent": False,
-        "word": word,
-        "heard_word": heard_word,
-        "phoneme_teacher": teacher_ph,
-        "phoneme_student": student_ph,
-        "phoneme_similarity": float(ph_sim),
-        "phonemeSimilarity": float(ph_sim),
-        "phoneme_score": float(score),
-        "phonemeScore": float(score),
-        "feedback": feedback,
-        "suggestion": feedback,
-        "audio_url": None,
-    })

pronragg.py DELETED Viewed

@@ -1,263 +0,0 @@
-import os
-import json
-import base64
-import tempfile
-import subprocess
-import re
-import random
-from flask import Blueprint, request, jsonify
-from flask_cors import CORS
-from pydub import AudioSegment
-from faster_whisper import WhisperModel
-from rapidfuzz.distance import Levenshtein
-import chromadb
-pronragg_bp = Blueprint("pronragg", __name__)
-# --------------------------------------------------
-# CONFIG
-# --------------------------------------------------
-# All paths are resolved relative to the directory that contains this module
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-VIDEO_PATH = os.path.join(BASE_DIR, "feedback.mp4")  # source video that build_video() cuts clips from
-JSON_PATH = os.path.join(BASE_DIR, "teacher_feedback_sentences_category.json")  # segment list read by init_segments()
-CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")  # on-disk location of the Chroma store
-WHISPER_MODEL = "base"  # model name passed to WhisperModel below
-SAFE_PADDING = 0.05  # widens the start/end of every clip handed to ffmpeg in build_video()
-PAUSE_SECONDS = 0.5  # hold appended after every clip except the last one
-MAX_SEGMENTS_PER_CATEGORY = 3  # upper bound on the segments returned by fetch_segments()
-# Issue priority (VERY IMPORTANT)
-# analyze() reports only the first entry of this list that it detected
-ISSUE_PRIORITY = [
-    "silence",
-    "multipleword",
-    "wrong_word",
-    "consonant",
-    "vowel",
-    "ending",
-    "syllable",
-    "stress",
-    "success"
-]
-# --------------------------------------------------
-# INIT MODELS
-# --------------------------------------------------
-# Created once at import time and shared by every request
-whisper = WhisperModel(
-    WHISPER_MODEL,
-    device="cpu",
-    compute_type="int8"
-)
-# --------------------------------------------------
-# CHROMA INIT
-# --------------------------------------------------
-client = chromadb.PersistentClient(path=CHROMA_DIR)
-# Reuses the "feedback" collection when it already exists in CHROMA_DIR
-collection = client.get_or_create_collection("feedback")
-def init_segments():
-    """
-    Load the feedback segments from JSON_PATH into the Chroma collection.
-    Does nothing when the collection already holds entries. Each JSON item must
-    provide "id", "text", "category", "start" and "end".
-    """
-    if collection.count() > 0:
-        return
-    with open(JSON_PATH, "r", encoding="utf-8") as f:
-        data = json.load(f)
-    for item in data:
-        collection.add(
-            # Chroma ids must be strings; coerce so numeric ids in the JSON also load
-            ids=[str(item["id"])],
-            documents=[item["text"]],
-            metadatas=[{
-                "category": item["category"],
-                "start": item["start"],
-                "end": item["end"]
-            }]
-        )
-init_segments()
-# --------------------------------------------------
-# HELPERS
-# --------------------------------------------------
-def normalize_text(text: str) -> str:
-    """Return text lower-cased with every character outside a-z removed, spaces included."""
-    lowered = text.lower().strip()
-    return re.sub(r"[^a-z]", "", lowered)
-def transcribe(wav_path: str) -> str:
-    """
-    Transcribe the audio file at wav_path with the module-level Whisper model.
-    Runs in English with beam size 5 and the VAD filter enabled, then joins the
-    segment texts and returns them stripped and lower-cased.
-    """
-    segments, _info = whisper.transcribe(wav_path, language="en", beam_size=5, vad_filter=True)
-    pieces = [segment.text for segment in segments]
-    return "".join(pieces).strip().lower()
-# --------------------------------------------------
-# PRONUNCIATION LOGIC (FIXED)
-# --------------------------------------------------
-def analyze(expected: str, heard_raw: str):
-    """
-    Classify a recognised utterance against the expected word.
-    Returns ([issue], score) with exactly one issue name. Early exits:
-    "silence" (score 0) when nothing usable was heard, "multipleword" (score 20)
-    when the raw transcript has more than one word, and "wrong_word" when the
-    normalised Levenshtein similarity is below 0.30. Otherwise the score is the
-    similarity as an integer percentage and the issue is the highest-priority
-    entry of ISSUE_PRIORITY among those detected, or "success" when none was.
-    """
-    expected_n = normalize_text(expected)
-    heard_n = normalize_text(heard_raw)
-    if not heard_n:
-        return ["silence"], 0
-    if len(heard_raw.strip().split()) > 1:
-        return ["multipleword"], 20
-    similarity = Levenshtein.normalized_similarity(expected_n, heard_n)
-    score = int(similarity * 100)
-    if similarity < 0.30:
-        return ["wrong_word"], score
-    vowel_set = set("aeiou")
-    found = set()
-    # A wrong first letter is filed under the class of the letter that was expected
-    if expected_n[0] != heard_n[0]:
-        found.add("vowel" if expected_n[0] in vowel_set else "consonant")
-    if [c for c in expected_n if c in vowel_set] != [c for c in heard_n if c in vowel_set]:
-        found.add("vowel")
-    if expected_n[-1] != heard_n[-1]:
-        found.add("ending")
-    # Letter-count difference stands in for a syllable-count difference
-    if abs(len(expected_n) - len(heard_n)) >= 2:
-        found.add("syllable")
-    # Stress is only reported when nothing more specific was detected
-    if not found and similarity < 0.85:
-        found.add("stress")
-    top_issue = next((name for name in ISSUE_PRIORITY if name in found), "success")
-    return [top_issue], score
-# --------------------------------------------------
-# FETCH SEGMENTS (STRICT)
-# --------------------------------------------------
-def fetch_segments(categories):
-    """
-    Return up to MAX_SEGMENTS_PER_CATEGORY randomly chosen segment metadata dicts
-    for the first category in categories.
-    Only categories[0] is used. Returns an empty list when categories is empty or
-    the collection holds no segment with that category.
-    """
-    if not categories:
-        return []
-    wanted = categories[0]
-    rows = collection.get(where={"category": wanted}).get("metadatas", [])
-    # Filter again locally so only exact category matches are kept
-    matching = [row for row in rows if row.get("category") == wanted]
-    if not matching:
-        return []
-    random.shuffle(matching)
-    return matching[:MAX_SEGMENTS_PER_CATEGORY]
-# --------------------------------------------------
-# BUILD VIDEO WITH FREEZE-HOLD PAUSE
-# --------------------------------------------------
-def build_video(segments):
-    """
-    Cut the given segments out of VIDEO_PATH, join them, and return the result
-    as a base64-encoded string.
-    segments is a list of dicts with "start" and "end"; they are processed in
-    order of "start". Every clip except the last gets a PAUSE_SECONDS hold (last
-    frame cloned, audio padded) appended. Returns "" when segments is empty.
-    ffmpeg exit codes are not checked. All scratch files are deleted before returning.
-    """
-    if not segments:
-        return ""
-    segments = sorted(segments, key=lambda x: x["start"])
-    scratch_files = []
-    def _scratch(suffix):
-        # Reserve a temp file name and close the handle; ffmpeg reopens it by path
-        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
-            scratch_files.append(handle.name)
-            return handle.name
-    try:
-        clips = []
-        for i, seg in enumerate(segments):
-            clip_path = _scratch(".mp4")
-            pause = PAUSE_SECONDS if i < len(segments) - 1 else 0
-            subprocess.run(
-                [
-                    "ffmpeg", "-y",
-                    "-ss", str(max(0, seg["start"] - SAFE_PADDING)),
-                    "-to", str(seg["end"] + SAFE_PADDING),
-                    "-i", VIDEO_PATH,
-                    "-vf", f"tpad=stop_mode=clone:stop_duration={pause}",
-                    "-af", f"apad=pad_dur={pause}",
-                    "-c:v", "libx264",
-                    "-c:a", "aac",
-                    "-movflags", "+faststart",
-                    clip_path
-                ],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL
-            )
-            clips.append(clip_path)
-        # ffmpeg's concat demuxer reads the clip list from a text file
-        concat_path = _scratch(".txt")
-        with open(concat_path, "w") as f:
-            for c in clips:
-                f.write(f"file '{c}'\n")
-        final_path = _scratch(".mp4")
-        subprocess.run(
-            [
-                "ffmpeg", "-y",
-                "-f", "concat",
-                "-safe", "0",
-                "-i", concat_path,
-                "-c:v", "libx264",
-                "-c:a", "aac",
-                final_path
-            ],
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL
-        )
-        with open(final_path, "rb") as f:
-            return base64.b64encode(f.read()).decode("utf-8")
-    finally:
-        # Each request creates several video files; remove them so the temp folder does not fill up
-        for path in scratch_files:
-            try:
-                os.remove(path)
-            except OSError:
-                pass
-# --------------------------------------------------
-# API
-# --------------------------------------------------
-@pronragg_bp.route("/score", methods=["POST"])
-def score_pronunciation():
-    """
-    Score the uploaded recording against the expected word.
-    Form fields: "word" and an "audio" file, both required (400 otherwise).
-    The upload is converted to WAV, transcribed and analysed; the reply carries
-    the expected and heard text, the detected issue, the score, and a
-    base64-encoded feedback video. When no segment exists for the detected
-    issue, the "silence" segments are used instead.
-    """
-    expected = request.form.get("word", "").strip()
-    audio = request.files.get("audio")
-    if not expected or not audio:
-        return jsonify({"error": "Missing input"}), 400
-    # Only the path is needed; close the handle so the upload can be saved by name
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as handle:
-        upload_path = handle.name
-    # Swap only the extension; str.replace would also rewrite ".webm" elsewhere in the path
-    wav = os.path.splitext(upload_path)[0] + ".wav"
-    try:
-        audio.save(upload_path)
-        AudioSegment.from_file(upload_path).export(wav, format="wav")
-        heard = transcribe(wav)
-    finally:
-        # Remove both scratch files whether or not conversion or transcription succeeded
-        for path in (upload_path, wav):
-            try:
-                os.remove(path)
-            except OSError:
-                pass
-    issues, score = analyze(expected, heard)
-    segments = fetch_segments(issues) or fetch_segments(["silence"])
-    video_blob = build_video(segments)
-    return jsonify({
-        "expected": expected,
-        "heard": heard,
-        "issues": issues,
-        "score": score,
-        "videoBlobBase64": video_blob
-    })

pronragupgrade.py → pronunciation.py RENAMED Viewed

@@ -5,24 +5,22 @@ import tempfile
 import subprocess
 import soundfile as sf
 import numpy as np
-import json
 import base64
 import random
 import chromadb
 import eng_to_ipa as ipa
-from flask import Flask, request, jsonify,Blueprint
-from flask_cors import CORS
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-pronragupgrade_bp = Blueprint("pronragupgrade", __name__)
 # ==================================================
 # 1. SETUP & CONFIG
 # ==================================================
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-VIDEO_PATH = os.path.join(BASE_DIR, "feedback.mp4")
-JSON_PATH = os.path.join(BASE_DIR, "teacher_feedback_sentences_category.json")
-CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_ID = "moxeeeem/wav2vec2-finetuned-pronunciation-correction"
@@ -38,97 +36,9 @@ model.eval()
 client = chromadb.PersistentClient(path=CHROMA_DIR)
 collection = client.get_or_create_collection("feedback")
-def init_segments():
-    if collection.count() > 0:
-        return
-    if not os.path.exists(JSON_PATH):
-        print(f"Warning: JSON file not found at {JSON_PATH}")
-        # Create more comprehensive dummy data with multiple clips per category
-        dummy_data = [
-            # Syllable category clips
-            {"id": 1, "text": "Let's work on syllable count", "category": "syllable", "start": 0, "end": 5},
-            {"id": 2, "text": "That word has multiple syllables", "category": "syllable", "start": 5, "end": 10},
-            {"id": 3, "text": "Make sure you pronounce all syllables", "category": "syllable", "start": 10, "end": 15},
-            # Ending category clips
-            {"id": 4, "text": "Focus on the ending sound", "category": "ending", "start": 15, "end": 20},
-            {"id": 5, "text": "Don't forget the final consonant", "category": "ending", "start": 20, "end": 25},
-            {"id": 6, "text": "Complete the word properly", "category": "ending", "start": 25, "end": 30},
-            # Vowel category clips
-            {"id": 7, "text": "Let's work on vowel sounds", "category": "vowel", "start": 30, "end": 35},
-            {"id": 8, "text": "The vowel should be clear", "category": "vowel", "start": 35, "end": 40},
-            {"id": 9, "text": "Focus on vowel quality", "category": "vowel", "start": 40, "end": 45},
-            # Consonant category clips
-            {"id": 10, "text": "Articulate consonants clearly", "category": "consonant", "start": 45, "end": 50},
-            {"id": 11, "text": "Consonants should be crisp", "category": "consonant", "start": 50, "end": 55},
-            {"id": 12, "text": "Work on consonant sounds", "category": "consonant", "start": 55, "end": 60},
-            # Stress category clips
-            {"id": 13, "text": "Focus on word stress", "category": "stress", "start": 60, "end": 65},
-            {"id": 14, "text": "Emphasize the correct syllable", "category": "stress", "start": 65, "end": 70},
-            {"id": 15, "text": "Watch your rhythm and stress", "category": "stress", "start": 70, "end": 75},
-            # Success category clips
-            {"id": 16, "text": "Excellent work!", "category": "success", "start": 75, "end": 80},
-            {"id": 17, "text": "Great pronunciation!", "category": "success", "start": 80, "end": 85},
-            {"id": 18, "text": "Keep up the good work!", "category": "success", "start": 85, "end": 90},
-            # Wrong word category clips
-            {"id": 19, "text": "That sounds like a different word", "category": "wrong_word", "start": 90, "end": 95},
-            {"id": 20, "text": "Please say the target word", "category": "wrong_word", "start": 95, "end": 100},
-            # Multiple words category clips
-            {"id": 21, "text": "Say just one word please", "category": "multiple_words", "start": 100, "end": 105},
-            {"id": 22, "text": "Focus on a single word", "category": "multiple_words", "start": 105, "end": 110},
-            # Silence category clips
-            {"id": 23, "text": "I couldn't hear anything", "category": "silence", "start": 110, "end": 115},
-            {"id": 24, "text": "Please speak louder", "category": "silence", "start": 115, "end": 120},
-            # Specific phoneme clips
-            {"id": 25, "text": "For the 'æ' sound like in cat", "category": "vowel", "phoneme": "æ", "start": 120, "end": 125},
-            {"id": 26, "text": "The 'r' should be soft", "category": "consonant", "phoneme": "r", "start": 125, "end": 130},
-            {"id": 27, "text": "The 'ɪ' sound is short", "category": "vowel", "phoneme": "ɪ", "start": 130, "end": 135},
-            {"id": 28, "text": "The 't' should be clear", "category": "consonant", "phoneme": "t", "start": 135, "end": 140},
-        ]
-        for item in dummy_data:
-            meta = {"category": item["category"], "start": item["start"], "end": item["end"]}
-            if "phoneme" in item:
-                meta["phoneme"] = item["phoneme"]
-            collection.add(ids=[str(item["id"])], documents=[item["text"]], metadatas=[meta])
-        print(f"Created {len(dummy_data)} dummy video segments in ChromaDB")
-        return
-    with open(JSON_PATH, "r", encoding="utf-8") as f:
-        data = json.load(f)
-    for item in data:
-        meta = {
-            "category": item["category"],
-            "start": item["start"],
-            "end": item["end"]
-        }
-        if "phoneme" in item:
-            meta["phoneme"] = item["phoneme"]
-        collection.add(
-            ids=[str(item["id"])],
-            documents=[item["text"]],
-            metadatas=[meta]
-        )
-    print(f"Loaded {len(data)} video segments into ChromaDB")
-init_segments()
 # ==================================================
 # 3. UK ENGLISH PRONUNCIATION SYSTEM
 # ==================================================
-# UK Phoneme Sound Database
 UK_PHONEME_DB = {
     "ɪ": {"name": "KIT vowel", "example": "sit", "tip": "Short front vowel", "type": "vowel"},
     "iː": {"name": "FLEECE vowel", "example": "see", "tip": "Long front vowel", "type": "vowel"},
@@ -142,13 +52,11 @@ UK_PHONEME_DB = {
     "ʌ": {"name": "STRUT vowel", "example": "cup", "tip": "Short mid back vowel", "type": "vowel"},
     "ɑː": {"name": "BATH vowel", "example": "father", "tip": "Long open back vowel", "type": "vowel"},
     "ɒ": {"name": "LOT vowel", "example": "hot", "tip": "Short open back rounded vowel", "type": "vowel"},
     "eɪ": {"name": "FACE diphthong", "example": "day", "tip": "Glide from e to ɪ", "type": "diphthong"},
     "aɪ": {"name": "PRICE diphthong", "example": "eye", "tip": "Glide from a to ɪ", "type": "diphthong"},
     "ɔɪ": {"name": "CHOICE diphthong", "example": "boy", "tip": "Glide from ɔ to ɪ", "type": "diphthong"},
     "aʊ": {"name": "MOUTH diphthong", "example": "now", "tip": "Glide from a to ʊ", "type": "diphthong"},
     "əʊ": {"name": "GOAT diphthong", "example": "go", "tip": "Glide from ə to ʊ", "type": "diphthong"},
     "p": {"name": "voiceless bilabial plosive", "example": "pen", "tip": "Explosive 'p' sound", "type": "consonant"},
     "b": {"name": "voiced bilabial plosive", "example": "bad", "tip": "Voiced 'b' with vibration", "type": "consonant"},
     "t": {"name": "voiceless alveolar plosive", "example": "tea", "tip": "Tongue tip on alveolar ridge", "type": "consonant"},
@@ -175,25 +83,9 @@ UK_PHONEME_DB = {
     "w": {"name": "labio-velar approximant", "example": "we", "tip": "Round lips", "type": "consonant"},
 }
-# Common words with syllable info
-COMMON_UK_WORDS = {
-    "rabbit": {"phonemes": ["r", "æ", "b", "ɪ", "t"], "syllables": 2, "stress": "first"},
-    "dog": {"phonemes": ["d", "ɒ", "ɡ"], "syllables": 1, "stress": "only"},
-    "cat": {"phonemes": ["k", "æ", "t"], "syllables": 1, "stress": "only"},
-    "water": {"phonemes": ["w", "ɔː", "t", "ə"], "syllables": 2, "stress": "first"},
-    "hello": {"phonemes": ["h", "ɛ", "l", "əʊ"], "syllables": 2, "stress": "second"},
-    "banana": {"phonemes": ["b", "ə", "n", "ɑː", "n", "ə"], "syllables": 3, "stress": "second"},
-    "computer": {"phonemes": ["k", "ə", "m", "p", "j", "uː", "t", "ə"], "syllables": 3, "stress": "second"},
-    "elephant": {"phonemes": ["ɛ", "l", "ɪ", "f", "ə", "n", "t"], "syllables": 3, "stress": "first"},
-}
 def get_uk_pronunciation(word):
-    """Get UK pronunciation with syllable info."""
     word_lower = word.lower().strip()
-    if word_lower in COMMON_UK_WORDS:
-        return COMMON_UK_WORDS[word_lower]["phonemes"]
     try:
         ipa_str = ipa.convert(word)
         clean_ipa = re.sub(r'[ˈˌː]', '', ipa_str)
@@ -211,10 +103,8 @@ def get_uk_pronunciation(word):
             i += 1
         return phonemes
-    except Exception as e:
-        print(f"Error getting IPA for {word}: {e}")
-        if word_lower == "rabbit":
-            return ["r", "æ", "b", "ɪ", "t"]
         phonemes = []
         for char in word_lower:
             if char in 'aeiou':
@@ -227,15 +117,6 @@ def get_uk_pronunciation(word):
         return phonemes
 def get_word_info(word):
-    """Get syllable and stress info for a word."""
-    word_lower = word.lower().strip()
-    if word_lower in COMMON_UK_WORDS:
-        return {
-            "syllables": COMMON_UK_WORDS[word_lower]["syllables"],
-            "stress": COMMON_UK_WORDS[word_lower]["stress"]
-        }
     phonemes = get_uk_pronunciation(word)
     vowel_count = sum(1 for p in phonemes
                      if UK_PHONEME_DB.get(p, {}).get('type') in ['vowel', 'diphthong'])
@@ -255,9 +136,7 @@ def get_word_info(word):
 # ==================================================
 # 4. CORRECTED PHONEME ANALYSIS
 # ==================================================
 def is_exact_phoneme_match(ref, stu):
-    """STRICT matching for accurate scoring."""
     if not stu:
         return False
@@ -278,8 +157,7 @@ def is_exact_phoneme_match(ref, stu):
     return False
-def analyze_pronunciation_strict(student_phonemes, reference_phonemes, word):
-    """STRICT analysis."""
     if not student_phonemes:
         return {
             "score": 0,
@@ -342,22 +220,7 @@ def analyze_pronunciation_strict(student_phonemes, reference_phonemes, word):
 # ==================================================
 # 5. SCENARIO DETECTION
 # ==================================================
 class ScenarioDetector:
-    """Scenario detection with correct priorities."""
-    SCENARIO_PRIORITIES = [
-        'silence',
-        'multiple_words',
-        'wrong_word',
-        'syllable',
-        'ending',
-        'vowel',
-        'consonant',
-        'stress',
-        'success',
-    ]
     @staticmethod
     def detect_silence(student_phonemes, audio_error=None):
         if audio_error:
@@ -664,7 +527,6 @@ class ScenarioDetector:
             ('stress', lambda: cls.detect_stress_issues(student_phonemes, reference_phonemes, word)),
             ('success', lambda: cls.detect_success(analysis_result, score)),
         ]
         for scenario_name, detector_func in detectors:
             result = detector_func()
@@ -682,214 +544,204 @@ class ScenarioDetector:
         }
 # ==================================================
-# 6. IMPROVED VIDEO RAG BUILDER - MERGES MULTIPLE PORTIONS
 # ==================================================
-# ==================================================
-# 6. IMPROVED VIDEO RAG BUILDER - SMART SELECTION
-# ==================================================
-def build_feedback_video(category, feedback_message, target_phoneme=None, student_errors=None):
-    """
-    Build feedback video with ordered, dynamic selection:
-    - success: [praise] -> [move-to-next]
-    - vowel: [specific phoneme] -> [one general]
-    - consonant: [specific phoneme] -> [one general]
-    - other categories: keep balanced/general strategies as before (2–3 clips)
-    Returns:
-        Base64 encoded video string with multiple merged clips
-    """
     print(f"\n=== Building video for: {category} ===")
     print(f"Target phoneme: {target_phoneme}")
-    print(f"Student errors: {student_errors}")
-    # Extract target phoneme from errors if not provided
-    if not target_phoneme and student_errors:
-        for error in student_errors:
-            if error.get("type") in ["vowel", "diphthong", "consonant"]:
-                target_phoneme = error.get("expected")
-                if target_phoneme:
-                    print(f"Extracted target phoneme from errors: {target_phoneme}")
-                    break
-    # Extract target phoneme from feedback if present
     if not target_phoneme:
         m = re.search(r"'([^']+)'", feedback_message)
         target_phoneme = m.group(1) if m else None
-        if target_phoneme:
-            print(f"Extracted target phoneme from feedback: {target_phoneme}")
     selected_metadatas = []
     try:
-        # Pull category clips
         gen_results = collection.get(where={"category": category})
-        if not gen_results or not gen_results.get('metadatas'):
             print(f"No clips found for category: {category}")
             return ""
-        metadatas = gen_results['metadatas']
-        documents = gen_results.get('documents', [])
-        # Safe zip in case of mismatch
         items = []
         for idx, meta in enumerate(metadatas):
             text = documents[idx] if idx < len(documents) else ""
             items.append({"meta": meta, "text": text})
-        # Split generic vs specific (for vowel/consonant)
         generic_clips = []
-        specific_clips = []  # list of tuples (meta, phoneme)
         for it in items:
             meta = it["meta"]
             clip_phoneme = meta.get("phoneme")
             if clip_phoneme:
-                specific_clips.append((meta, clip_phoneme))
             else:
-                # attach text for success/vowel/consonant classification later
                 meta_copy = dict(meta)
-                meta_copy["_text"] = it["text"]
                 generic_clips.append(meta_copy)
         print(f"Found {len(generic_clips)} generic clips, {len(specific_clips)} specific clips")
-        # Special ordering rules
-        if category == "success":
-            # First: praise message, then: move-next message (both random, dynamic)
-            praise_keywords = ["good", "great", "perfect", "excellent", "well done", "nice", "clear"]
-            next_keywords = ["next", "move"]
-            # Build pools from generic success clips using text
-            praise_pool = [m for m in generic_clips if any(k in m.get("_text", "").lower() for k in praise_keywords)]
-            next_pool = [m for m in generic_clips if any(k in m.get("_text", "").lower() for k in next_keywords)]
-            print(f"Success classification: praise={len(praise_pool)} next={len(next_pool)}")
-            # Pick first (praise) randomly
-            first_clip = random.choice(praise_pool) if praise_pool else (random.choice(generic_clips) if generic_clips else None)
-            # Pick second (move-next) randomly and ensure different from first
-            if next_pool:
-                next_candidates = [m for m in next_pool if f"{m.get('start')}_{m.get('end')}" != f"{first_clip.get('start')}_{first_clip.get('end')}" ] if first_clip else next_pool
-                second_clip = random.choice(next_candidates) if next_candidates else None
-            else:
-                # Fallback: pick any other success generic clip different from first
-                alt_candidates = [m for m in generic_clips if f"{m.get('start')}_{m.get('end')}" != f"{first_clip.get('start')}_{first_clip.get('end')}" ] if first_clip else generic_clips
-                second_clip = random.choice(alt_candidates) if len(alt_candidates) > 0 else None
-            selected_metadatas.clear()
-            if first_clip:
-                selected_metadatas.append(first_clip)
-            if second_clip:
-                selected_metadatas.append(second_clip)
-        elif category in ["vowel", "consonant"]:
-            # Specific first, then exactly one general
-            specific_found = False
-            # 1) exact phoneme
-            if target_phoneme:
-                for meta, clip_phoneme in specific_clips:
-                    if clip_phoneme == target_phoneme:
-                        selected_metadatas.append(meta)
-                        specific_found = True
-                        print(f"✓ Selected specific {category} clip for phoneme: {target_phoneme}")
-                        break
-            # 2) related fallback (mostly for vowels)
-            if not specific_found and target_phoneme and category == "vowel":
-                vowel_groups = {
-                    'ɪ': ['iː', 'i'], 'iː': ['ɪ', 'i'],
-                    'æ': ['a', 'ɑː'], 'ɑː': ['æ', 'a'],
-                    'ʊ': ['uː', 'u'], 'uː': ['ʊ', 'u'],
-                    'ɒ': ['ɔ', 'ɔː'], 'ɔː': ['ɒ', 'ɔ'],
-                }
-                related_phonemes = vowel_groups.get(target_phoneme, [])
-                for meta, clip_phoneme in specific_clips:
-                    if clip_phoneme in related_phonemes:
-                        selected_metadatas.append(meta)
-                        specific_found = True
-                        print(f"✓ Selected related vowel clip: {clip_phoneme} for target {target_phoneme}")
-                        break
-            # 3) If still not found and we have any specific clip with same category, prefer one that exists
-            if not specific_found and specific_clips:
-                fallback_meta, fallback_ph = random.choice(specific_clips)
-                selected_metadatas.append(fallback_meta)
-                specific_found = True
-                print(f"✓ Fallback to available specific {category} clip: {fallback_ph}")
-            # Then exactly one general
-            if generic_clips:
-                general_choice = random.choice(generic_clips)
-                selected_metadatas.append(general_choice)
-                print("✓ Added one general clip after specific")
-            # Note: If no generic and only specific found, we keep only one clip.
-            # If no specific and generic exists, we keep one general clip (as requested “only one general”).
         else:
-            # Keep existing smart strategy for other categories
-            selection_strategy = "balanced"
-            if category in ["syllable", "ending", "stress"]:
-                selection_strategy = "general_focus"
-            print(f"Using selection strategy: {selection_strategy}")
-            if selection_strategy == "general_focus":
-                if generic_clips:
-                    selected_generic = random.sample(generic_clips, min(2, len(generic_clips)))
-                    selected_metadatas.extend(selected_generic)
-                # Add a specific if relevant and space remains
-                if target_phoneme and len(selected_metadatas) < 3:
-                    for meta, clip_phoneme in specific_clips:
-                        if clip_phoneme == target_phoneme:
-                            selected_metadatas.append(meta)
-                            print(f"✓ Added specific clip for: {target_phoneme}")
-                            break
             else:
-                # balanced
-                if generic_clips:
-                    selected_metadatas.append(random.choice(generic_clips))
-                if target_phoneme:
-                    for meta, clip_phoneme in specific_clips:
-                        if clip_phoneme == target_phoneme:
-                            selected_metadatas.append(meta)
-                            print(f"✓ Selected specific clip for: {target_phoneme}")
-                            break
-                # Fill with additional generic if needed
                 if len(selected_metadatas) < 2 and generic_clips:
-                    remaining = [c for c in generic_clips if c not in selected_metadatas]
                     if remaining:
                         selected_metadatas.append(random.choice(remaining))
-        # Remove duplicates while preserving order
         unique_metadatas = []
         seen = set()
         for meta in selected_metadatas:
-            key = f"{meta.get('start')}_{meta.get('end')}"
             if key not in seen:
                 seen.add(key)
                 unique_metadatas.append(meta)
         selected_metadatas = unique_metadatas
-        # Ensure minimum clips but DO NOT violate vowel/consonant rule (only one general)
-        if category not in ["vowel", "consonant"]:
-            if len(selected_metadatas) < 2 and generic_clips:
-                needed = 2 - len(selected_metadatas)
-                remaining = [c for c in generic_clips if c not in selected_metadatas]
-                if remaining:
-                    selected_metadatas.extend(random.sample(remaining, min(needed, len(remaining))))
         if len(selected_metadatas) == 0:
             print("No clips selected after filtering.")
             return ""
         print(f"Selected {len(selected_metadatas)} video clips:")
-        for i, meta in enumerate(selected_metadatas):
-            phoneme = meta.get('phoneme', 'generic')
-            print(f"  Clip {i+1}: {meta.get('category')} - {phoneme} [{meta.get('start')}->{meta.get('end')}]")
         # --- FFmpeg Processing ---
         if not os.path.exists(VIDEO_PATH):
@@ -901,37 +753,43 @@ def build_feedback_video(category, feedback_message, target_phoneme=None, studen
         final_video_path = None
         try:
-            # Extract individual clips
             for i, seg in enumerate(selected_metadatas):
                 tmp_clip = tempfile.NamedTemporaryFile(delete=False, suffix=f"_{i}.mp4")
                 tmp_clip.close()
-                # Extract segment
-                subprocess.run([
-                    "ffmpeg", "-y", "-ss", str(seg["start"]), "-to", str(seg["end"]),
-                    "-i", VIDEO_PATH, "-c:v", "libx264", "-preset", "ultrafast",
-                    "-crf", "28", "-c:a", "aac", tmp_clip.name
-                ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                 clips.append(tmp_clip.name)
-            # Create concat list
             concat_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w")
             for clip_path in clips:
                 concat_file.write(f"file '{os.path.abspath(clip_path)}'\n")
             concat_file.close()
-            # Create final video
             final_video_path = tempfile.NamedTemporaryFile(delete=False, suffix="_final.mp4")
             final_video_path.close()
-            # Concatenate
-            subprocess.run([
-                "ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", concat_file.name,
-                "-c", "copy", final_video_path.name
-            ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-            # Encode to Base64
             with open(final_video_path.name, "rb") as f:
                 v_data = base64.b64encode(f.read()).decode()
@@ -943,7 +801,6 @@ def build_feedback_video(category, feedback_message, target_phoneme=None, studen
             return ""
         finally:
-            # Cleanup
             if concat_file and os.path.exists(concat_file.name):
                 os.remove(concat_file.name)
@@ -957,12 +814,11 @@ def build_feedback_video(category, feedback_message, target_phoneme=None, studen
     except Exception as e:
         print(f"✗ Video generation error: {e}")
         return ""
 # ==================================================
 # 7. AUDIO PROCESSING
 # ==================================================
 def process_audio_file(audio_path):
-    """Process audio file."""
     try:
         wav_path = audio_path.replace('.webm', '.wav')
@@ -1007,53 +863,13 @@ def process_audio_file(audio_path):
         return None, f"error: {str(e)}"
 # ==================================================
-# 8. TEST VIDEO GENERATION
 # ==================================================
-def test_video_generation():
-    """Test that video generation merges multiple clips."""
-    print("\n=== TESTING VIDEO GENERATION ===")
-    test_cases = [
-        {"category": "syllable", "feedback": "Syllable issue", "target_phoneme": None},
-        {"category": "vowel", "feedback": "Vowel issue for 'æ'", "target_phoneme": "æ"},
-        {"category": "consonant", "feedback": "Consonant issue for 'r'", "target_phoneme": "r"},
-        {"category": "ending", "feedback": "Missing final 't'", "target_phoneme": "t"},
-    ]
-    for test in test_cases:
-        print(f"\nTesting category: {test['category']}")
-        video_blob = build_feedback_video(
-            test['category'],
-            test['feedback'],
-            test['target_phoneme']
-        )
-        if video_blob:
-            print(f"✓ Video generated successfully ({len(video_blob)} bytes)")
-            print(f"  Contains multiple merged clips")
-        else:
-            print(f"✗ Failed to generate video")
-        # Also test with just the feedback message
-        video_blob2 = build_feedback_video(
-            test['category'],
-            test['feedback']
-        )
-        if video_blob2:
-            print(f"✓ Video also works without explicit target phoneme")
-    print("\n" + "="*60)
-# ==================================================
-# 9. MAIN ENDPOINT
-# ==================================================
-@pronragupgrade_bp.route("/score", methods=["POST"])
 def train_pronunciation():
-    """Main endpoint with multi-clip video feedback."""
     try:
         word = request.form.get('word', '').strip().lower()
         if not word:
             return jsonify({
@@ -1078,17 +894,11 @@ def train_pronunciation():
         print(f"\n=== Processing: '{word}' ===")
         try:
-            # Process audio
             student_phonemes, audio_error = process_audio_file(temp_path)
-            # Get reference
             reference_phonemes = get_uk_pronunciation(word)
-            # Analyze
-            analysis = analyze_pronunciation_strict(student_phonemes, reference_phonemes, word)
             score = analysis["score"]
-            # Detect scenario
             scenario_info = ScenarioDetector.detect_scenarios(
                 student_phonemes=student_phonemes,
                 reference_phonemes=reference_phonemes,
@@ -1103,11 +913,9 @@ def train_pronunciation():
             action = scenario_info.get('action', '')
             target_phoneme = scenario_info.get('target_phoneme')
-            # Generate video with MULTIPLE clips
             print(f"Generating video for category: {category}")
             video_blob = build_feedback_video(category, feedback, target_phoneme)
-            # Prepare response
             response = {
                 "success": True,
                 "scenario": scenario,
@@ -1144,3 +952,4 @@ def train_pronunciation():
             "scenario": "system_error"
         }), 500

 import subprocess
 import soundfile as sf
 import numpy as np
 import base64
 import random
 import chromadb
 import eng_to_ipa as ipa
+from flask import Blueprint
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+pronunciation_bp = Blueprint("pronunciation", __name__)
 # ==================================================
 # 1. SETUP & CONFIG
 # ==================================================
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+print("BASE_DIR:", BASE_DIR)
+VIDEO_PATH = os.path.join(BASE_DIR, "assets/feedback.mp4")
+CHROMA_DIR = os.path.join(BASE_DIR, "assets/chroma_db")
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_ID = "moxeeeem/wav2vec2-finetuned-pronunciation-correction"
 client = chromadb.PersistentClient(path=CHROMA_DIR)
 collection = client.get_or_create_collection("feedback")
 # ==================================================
 # 3. UK ENGLISH PRONUNCIATION SYSTEM
 # ==================================================
 UK_PHONEME_DB = {
     "ɪ": {"name": "KIT vowel", "example": "sit", "tip": "Short front vowel", "type": "vowel"},
     "iː": {"name": "FLEECE vowel", "example": "see", "tip": "Long front vowel", "type": "vowel"},
     "ʌ": {"name": "STRUT vowel", "example": "cup", "tip": "Short mid back vowel", "type": "vowel"},
     "ɑː": {"name": "BATH vowel", "example": "father", "tip": "Long open back vowel", "type": "vowel"},
     "ɒ": {"name": "LOT vowel", "example": "hot", "tip": "Short open back rounded vowel", "type": "vowel"},
     "eɪ": {"name": "FACE diphthong", "example": "day", "tip": "Glide from e to ɪ", "type": "diphthong"},
     "aɪ": {"name": "PRICE diphthong", "example": "eye", "tip": "Glide from a to ɪ", "type": "diphthong"},
     "ɔɪ": {"name": "CHOICE diphthong", "example": "boy", "tip": "Glide from ɔ to ɪ", "type": "diphthong"},
     "aʊ": {"name": "MOUTH diphthong", "example": "now", "tip": "Glide from a to ʊ", "type": "diphthong"},
     "əʊ": {"name": "GOAT diphthong", "example": "go", "tip": "Glide from ə to ʊ", "type": "diphthong"},
     "p": {"name": "voiceless bilabial plosive", "example": "pen", "tip": "Explosive 'p' sound", "type": "consonant"},
     "b": {"name": "voiced bilabial plosive", "example": "bad", "tip": "Voiced 'b' with vibration", "type": "consonant"},
     "t": {"name": "voiceless alveolar plosive", "example": "tea", "tip": "Tongue tip on alveolar ridge", "type": "consonant"},
     "w": {"name": "labio-velar approximant", "example": "we", "tip": "Round lips", "type": "consonant"},
 }
 def get_uk_pronunciation(word):
     word_lower = word.lower().strip()
     try:
         ipa_str = ipa.convert(word)
         clean_ipa = re.sub(r'[ˈˌː]', '', ipa_str)
             i += 1
         return phonemes
+    except Exception:
+        # Simple fallback for basic words
         phonemes = []
         for char in word_lower:
             if char in 'aeiou':
         return phonemes
 def get_word_info(word):
     phonemes = get_uk_pronunciation(word)
     vowel_count = sum(1 for p in phonemes
                      if UK_PHONEME_DB.get(p, {}).get('type') in ['vowel', 'diphthong'])
 # ==================================================
 # 4. CORRECTED PHONEME ANALYSIS
 # ==================================================
 def is_exact_phoneme_match(ref, stu):
     if not stu:
         return False
     return False
+def analyze_pronunciation_strict(student_phonemes, reference_phonemes):
     if not student_phonemes:
         return {
             "score": 0,
 # ==================================================
 # 5. SCENARIO DETECTION
 # ==================================================
 class ScenarioDetector:
     @staticmethod
     def detect_silence(student_phonemes, audio_error=None):
         if audio_error:
             ('stress', lambda: cls.detect_stress_issues(student_phonemes, reference_phonemes, word)),
             ('success', lambda: cls.detect_success(analysis_result, score)),
         ]
         for scenario_name, detector_func in detectors:
             result = detector_func()
         }
 # ==================================================
+# 6. VIDEO RAG BUILDER
 # ==================================================
+def build_feedback_video(category, feedback_message, target_phoneme=None):
     print(f"\n=== Building video for: {category} ===")
     print(f"Target phoneme: {target_phoneme}")
     if not target_phoneme:
         m = re.search(r"'([^']+)'", feedback_message)
         target_phoneme = m.group(1) if m else None
     selected_metadatas = []
     try:
         gen_results = collection.get(where={"category": category})
+        if not gen_results or not gen_results.get("metadatas"):
             print(f"No clips found for category: {category}")
             return ""
+        metadatas = gen_results["metadatas"]
+        documents = gen_results.get("documents", [])
         items = []
         for idx, meta in enumerate(metadatas):
             text = documents[idx] if idx < len(documents) else ""
             items.append({"meta": meta, "text": text})
+        # Split into:
+        # - specific clips = has phoneme in metadata
+        # - generic clips = no phoneme in metadata
         generic_clips = []
+        specific_clips = []
         for it in items:
             meta = it["meta"]
+            text = it["text"] or ""
             clip_phoneme = meta.get("phoneme")
             if clip_phoneme:
+                specific_clips.append({"meta": meta, "phoneme": clip_phoneme, "text": text})
             else:
                 meta_copy = dict(meta)
+                meta_copy["_text"] = text
                 generic_clips.append(meta_copy)
         print(f"Found {len(generic_clips)} generic clips, {len(specific_clips)} specific clips")
+        def _seg_key(m):
+            return f"{m.get('start')}_{m.get('end')}"
+        def pick_generic(exclude_keys=None):
+            exclude_keys = exclude_keys or set()
+            pool = [m for m in generic_clips if _seg_key(m) not in exclude_keys]
+            if pool:
+                return random.choice(pool)
+            return None
+        def pick_specific_for_phoneme(target, related_map=None, exclude_keys=None):
+            exclude_keys = exclude_keys or set()
+            related_map = related_map or {}
+            # 1) exact match
+            if target:
+                for it in specific_clips:
+                    if it["phoneme"] == target and _seg_key(it["meta"]) not in exclude_keys:
+                        return it["meta"]
+            # 2) related (mainly for vowels)
+            if target and target in related_map:
+                for rel in related_map[target]:
+                    for it in specific_clips:
+                        if it["phoneme"] == rel and _seg_key(it["meta"]) not in exclude_keys:
+                            return it["meta"]
+            # 3) fallback any specific
+            pool = [it["meta"] for it in specific_clips if _seg_key(it["meta"]) not in exclude_keys]
+            if pool:
+                return random.choice(pool)
+            return None
+        # -------------------------
+        # REQUIRED CHANGE:
+        # For vowel/consonant:
+        # Always try to return TWO clips in this order:
+        # 1) specific phoneme clip (target phoneme)
+        # 2) general clip (generic feedback of vowel/consonant)
+        # -------------------------
+        if category in ["vowel", "consonant"]:
+            exclude = set()
+            vowel_groups = {
+                "ɪ": ["iː", "i"], "iː": ["ɪ", "i"],
+                "æ": ["a", "ɑː"], "ɑː": ["æ", "a"],
+                "ʊ": ["uː", "u"], "uː": ["ʊ", "u"],
+                "ɒ": ["ɔ", "ɔː"], "ɔː": ["ɒ", "ɔ"],
+            }
+            related_map = vowel_groups if category == "vowel" else {}
+            # 1) Pick SPECIFIC (phoneme)
+            specific_meta = pick_specific_for_phoneme(target_phoneme, related_map=related_map, exclude_keys=exclude)
+            if specific_meta:
+                selected_metadatas.append(specific_meta)
+                exclude.add(_seg_key(specific_meta))
+                print(f"✓ Selected specific {category} clip for phoneme: {target_phoneme}")
+            # 2) Pick GENERAL (generic)
+            generic_meta = pick_generic(exclude_keys=exclude)
+            if generic_meta:
+                selected_metadatas.append(generic_meta)
+                exclude.add(_seg_key(generic_meta))
+                print("✓ Selected general (generic) clip")
+            # If still not 2 clips, try to fill with another different clip (best effort)
+            if len(selected_metadatas) < 2:
+                # try another generic first
+                extra_generic = pick_generic(exclude_keys=exclude)
+                if extra_generic:
+                    selected_metadatas.append(extra_generic)
+                    exclude.add(_seg_key(extra_generic))
+                    print("✓ Filled missing slot with another generic clip")
+            if len(selected_metadatas) < 2:
+                # try another specific as last fallback
+                extra_specific = pick_specific_for_phoneme(None, related_map=None, exclude_keys=exclude)
+                if extra_specific:
+                    selected_metadatas.append(extra_specific)
+                    exclude.add(_seg_key(extra_specific))
+                    print("✓ Filled missing slot with another specific clip")
+            # If we still cannot make 2 clips, we proceed with whatever we have.
+            # (Because the DB may not have enough clips.)
+            if not selected_metadatas:
+                print("✗ No clips selected for vowel/consonant.")
+                return ""
+        # -------------------------
+        # Existing logic for other categories (unchanged)
+        # -------------------------
         else:
+            if category == "success":
+                praise_keywords = ["good", "great", "perfect", "excellent", "well done", "nice", "clear"]
+                next_keywords = ["next", "move"]
+                praise_pool = [m for m in generic_clips if any(k in m.get("_text", "").lower() for k in praise_keywords)]
+                next_pool = [m for m in generic_clips if any(k in m.get("_text", "").lower() for k in next_keywords)]
+                print(f"Success classification: praise={len(praise_pool)} next={len(next_pool)}")
+                first_clip = random.choice(praise_pool) if praise_pool else (random.choice(generic_clips) if generic_clips else None)
+                if next_pool:
+                    next_candidates = [m for m in next_pool if _seg_key(m) != _seg_key(first_clip)] if first_clip else next_pool
+                    second_clip = random.choice(next_candidates) if next_candidates else None
+                else:
+                    alt_candidates = [m for m in generic_clips if _seg_key(m) != _seg_key(first_clip)] if first_clip else generic_clips
+                    second_clip = random.choice(alt_candidates) if len(alt_candidates) > 0 else None
+                selected_metadatas.clear()
+                if first_clip:
+                    selected_metadatas.append(first_clip)
+                if second_clip:
+                    selected_metadatas.append(second_clip)
             else:
+                selection_strategy = "balanced"
+                if category in ["syllable", "ending", "stress"]:
+                    selection_strategy = "general_focus"
+                print(f"Using selection strategy: {selection_strategy}")
+                if selection_strategy == "general_focus":
+                    if generic_clips:
+                        selected_generic = random.sample(generic_clips, min(2, len(generic_clips)))
+                        selected_metadatas.extend(selected_generic)
+                else:
+                    if generic_clips:
+                        selected_metadatas.append(random.choice(generic_clips))
+                # ensure at least 2 clips when possible
                 if len(selected_metadatas) < 2 and generic_clips:
+                    remaining = [c for c in generic_clips if _seg_key(c) not in {_seg_key(x) for x in selected_metadatas}]
                     if remaining:
                         selected_metadatas.append(random.choice(remaining))
+        # Deduplicate (safety)
         unique_metadatas = []
         seen = set()
         for meta in selected_metadatas:
+            key = _seg_key(meta)
             if key not in seen:
                 seen.add(key)
                 unique_metadatas.append(meta)
         selected_metadatas = unique_metadatas
         if len(selected_metadatas) == 0:
             print("No clips selected after filtering.")
             return ""
         print(f"Selected {len(selected_metadatas)} video clips:")
         # --- FFmpeg Processing ---
         if not os.path.exists(VIDEO_PATH):
         final_video_path = None
         try:
             for i, seg in enumerate(selected_metadatas):
                 tmp_clip = tempfile.NamedTemporaryFile(delete=False, suffix=f"_{i}.mp4")
                 tmp_clip.close()
+                subprocess.run(
+                    [
+                        "ffmpeg", "-y",
+                        "-ss", str(seg["start"]), "-to", str(seg["end"]),
+                        "-i", VIDEO_PATH,
+                        "-c:v", "libx264", "-preset", "ultrafast",
+                        "-crf", "28", "-c:a", "aac",
+                        tmp_clip.name
+                    ],
+                    stdout=subprocess.DEVNULL,
+                    stderr=subprocess.DEVNULL,
+                )
                 clips.append(tmp_clip.name)
             concat_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w")
             for clip_path in clips:
                 concat_file.write(f"file '{os.path.abspath(clip_path)}'\n")
             concat_file.close()
             final_video_path = tempfile.NamedTemporaryFile(delete=False, suffix="_final.mp4")
             final_video_path.close()
+            subprocess.run(
+                [
+                    "ffmpeg", "-y",
+                    "-f", "concat", "-safe", "0", "-i", concat_file.name,
+                    "-c", "copy", final_video_path.name
+                ],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
             with open(final_video_path.name, "rb") as f:
                 v_data = base64.b64encode(f.read()).decode()
             return ""
         finally:
             if concat_file and os.path.exists(concat_file.name):
                 os.remove(concat_file.name)
     except Exception as e:
         print(f"✗ Video generation error: {e}")
         return ""
 # ==================================================
 # 7. AUDIO PROCESSING
 # ==================================================
 def process_audio_file(audio_path):
     try:
         wav_path = audio_path.replace('.webm', '.wav')
         return None, f"error: {str(e)}"
 # ==================================================
+# 8. MAIN ENDPOINT
 # ==================================================
+@pronunciation_bp.route("/score", methods=["POST"])
 def train_pronunciation():
     try:
+        from flask import request, jsonify
         word = request.form.get('word', '').strip().lower()
         if not word:
             return jsonify({
         print(f"\n=== Processing: '{word}' ===")
         try:
             student_phonemes, audio_error = process_audio_file(temp_path)
             reference_phonemes = get_uk_pronunciation(word)
+            analysis = analyze_pronunciation_strict(student_phonemes, reference_phonemes)
             score = analysis["score"]
             scenario_info = ScenarioDetector.detect_scenarios(
                 student_phonemes=student_phonemes,
                 reference_phonemes=reference_phonemes,
             action = scenario_info.get('action', '')
             target_phoneme = scenario_info.get('target_phoneme')
             print(f"Generating video for category: {category}")
             video_blob = build_feedback_video(category, feedback, target_phoneme)
             response = {
                 "success": True,
                 "scenario": scenario,
             "scenario": "system_error"
         }), 500

pronvideo.py DELETED Viewed

@@ -1,359 +0,0 @@
-import os
-import io
-import tempfile
-from flask import Flask, Blueprint, request, jsonify
-from flask_cors import CORS
-from pydub import AudioSegment
-from rapidfuzz.distance import Levenshtein
-# ASR - WhisperX (or Faster Whisper for Forced Alignment)
-try:
-    from faster_whisper import WhisperModel
-    HAS_WHISPER = True
-except Exception:
-    HAS_WHISPER = False
-# Initialize the Flask app and Blueprint
-pronvideo_bp = Blueprint("pronvideo", __name__)
-# -----------------------------
-# Load Whisper model (CPU friendly)
-# -----------------------------
-# Model size comes from the WHISPER_MODEL_SIZE environment variable ("base" if unset).
-WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "base")
-# Stays None when faster_whisper could not be imported; transcribe_audio() checks for that.
-whisper_model = None
-if HAS_WHISPER:
-    # Loaded once at import time, on CPU with int8 compute.
-    whisper_model = WhisperModel(
-        WHISPER_MODEL_SIZE,
-        device="cpu",
-        compute_type="int8"
-    )
-# -----------------------------
-# Helpers
-# -----------------------------
-def normalize(text: str) -> str:
-    """Lower-case and trim *text*, then keep only letters and whitespace."""
-    trimmed = text.lower().strip()
-    kept = [c for c in trimmed if c.isalpha() or c.isspace()]
-    return "".join(kept)
-def phoneme_similarity_score(expected_ph: str, spoken_ph: str) -> int:
-    """Return a 0-100 similarity score derived from the Levenshtein distance.
-
-    An empty value on either side scores 0.
-    """
-    if not (expected_ph and spoken_ph):
-        return 0
-    longest = max(len(expected_ph), len(spoken_ph))
-    edits = Levenshtein.distance(expected_ph, spoken_ph)
-    # Distance is normalised by the longer string before scaling to a percentage.
-    percent = int(round((1 - (edits / longest)) * 100))
-    return min(100, max(0, percent))
-def convert_to_wav_temp(upload_file) -> str:
-    """Convert an uploaded audio file to a mono 16 kHz WAV temp file.
-
-    Reads the whole upload stream into memory, decodes it with pydub (first
-    with the filename extension as a format hint, then without one), and
-    exports the result to a new temporary ``.wav`` file.
-
-    Returns the path of the temp file; the caller is responsible for deleting it.
-    Raises whatever pydub raises when the audio cannot be decoded or exported.
-    """
-    upload_file.stream.seek(0)
-    raw = upload_file.stream.read()
-    bio = io.BytesIO(raw)
-    # filename may be missing on some uploads; treat that as "no extension".
-    ext = os.path.splitext(upload_file.filename or "")[1].replace(".", "").lower() or None
-    try:
-        audio = AudioSegment.from_file(bio, format=ext)
-    except Exception:
-        # The extension can be misleading; rewind and retry without a format hint.
-        bio.seek(0)
-        audio = AudioSegment.from_file(bio)
-    audio = audio.set_channels(1).set_frame_rate(16000)
-    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    # Only the reserved name is needed. Close our handle so it is not leaked
-    # and so the export can reopen the path on platforms that lock open files.
-    tmp.close()
-    try:
-        audio.export(tmp.name, format="wav")
-    except Exception:
-        # Do not leave an orphaned temp file behind when the export fails.
-        os.remove(tmp.name)
-        raise
-    return tmp.name
-def transcribe_audio(audio_path: str) -> str:
-    """Transcribe *audio_path* as English and return the joined segment text.
-
-    Raises RuntimeError when the Whisper model is not available.
-    """
-    if not HAS_WHISPER or whisper_model is None:
-        raise RuntimeError("Whisper ASR is not installed/available.")
-    segments, _info = whisper_model.transcribe(audio_path, language="en", vad_filter=True)
-    # Segments with empty text are skipped; the rest are trimmed and space-joined.
-    pieces = [segment.text.strip() for segment in segments if segment.text]
-    return " ".join(pieces).strip()
-# -----------------------------
-# Video feedback helpers
-# -----------------------------
-def static_video_for(kind: str):
-    """Look up the static feedback video (id, URL, hint) for a feedback *kind*.
-
-    Unknown kinds yield a dict whose three values are all None.
-    """
-    catalogue = {
-        "success": {
-            "videoId": "video-success",
-            "videoUrl": "/assets/pronvideo/feedback/success.mp4",
-            "hint": "Great job! Keep going.",
-        },
-        "silence": {
-            "videoId": "video-silence",
-            "videoUrl": "/assets/pronvideo/feedback/silence.mp4",
-            "hint": "Speak clearly into the mic for at least a second.",
-        },
-        "wrong_word": {
-            "videoId": "video-wrong-word",
-            "videoUrl": "/assets/pronvideo/feedback/wrongword.mp4",
-            "hint": "Please say only the target word.",
-        },
-        "vowel": {
-            "videoId": "video-vowel",
-            "videoUrl": "/assets/pronvideo/feedback/vowel.mp4",
-            "hint": "Work on vowel shape and length.",
-        },
-        "consonant": {
-            "videoId": "video-consonant",
-            "videoUrl": "/assets/pronvideo/feedback/consonant.mp4",
-            "hint": "Focus on consonant articulation, especially start/end sounds.",
-        },
-        "stress": {
-            "videoId": "video-stress",
-            "videoUrl": "/assets/pronvideo/feedback/stress.mp4",
-            "hint": "Emphasize the primary stressed syllable.",
-        },
-        "syllable": {
-            "videoId": "video-syllable",
-            "videoUrl": "/assets/pronvideo/feedback/syllable.mp4",
-            "hint": "Match the number of syllables and rhythm.",
-        },
-        "ending": {
-            "videoId": "video-ending",
-            "videoUrl": "/assets/pronvideo/feedback/ending.mp4",
-            "hint": "Work on the final sound—try to finish the word cleanly.",
-        },
-        "multipleword": {
-            "videoId": "video-multipleword",
-            "videoUrl": "/assets/pronvideo/feedback/multipleword.mp4",
-            "hint": "Please say only the target word, not multiple words.",
-        },
-    }
-    if kind in catalogue:
-        return catalogue[kind]
-    return {"videoId": None, "videoUrl": None, "hint": None}
-# -----------------------------
-# Function to detect feedback based on pronunciation
-# -----------------------------
-def vowel_consonant_feedback(teacher_ph: str, student_ph: str):
-    """Compare teacher and student pronunciations and list what differs.
-
-    Returns a list of ``{"title", "message"}`` dicts, in the order vowel,
-    consonant, ending; the list is empty when none of the checks differ.
-    """
-    teacher_tokens = split_ipa_tokens(teacher_ph)
-    student_tokens = split_ipa_tokens(student_ph)
-    notes = []
-    def add(title, message):
-        notes.append({"title": title, "message": message})
-    # Vowel sequences are compared as whole strings.
-    if extract_vowel_sequence(teacher_ph) != extract_vowel_sequence(student_ph):
-        add("Vowel Accuracy", "Your vowel sound is different. Focus on long/short quality and mouth opening.")
-    # Consonants are compared as ordered token lists.
-    if extract_consonant_tokens(teacher_tokens) != extract_consonant_tokens(student_tokens):
-        add("Consonant Accuracy", "Some consonant sounds differ. Pay attention to start and end sounds.")
-    # The ending is only judged when both sides have a final token.
-    teacher_end = last_ending_token(teacher_tokens)
-    student_end = last_ending_token(student_tokens)
-    if teacher_end and student_end and teacher_end != student_end:
-        add("Ending Sound", f"The final sound differs. Try to end with '{teacher_end}'.")
-    return notes
-# -----------------------------
-# Syllable estimation logic
-# -----------------------------
-def syllable_estimate(ipa: str):
-    """Estimate the syllable count of *ipa* as its number of vowel runs.
-
-    Each maximal run of consecutive characters found in VOWELS counts as one
-    syllable. The result is never below 1, even for input without vowels.
-    """
-    count = 0
-    in_vowel = False
-    for ch in ipa:
-        if ch in VOWELS:
-            # Only the first vowel of a run opens a new syllable. The flag has to
-            # stay set for the rest of the run; resetting it on the second vowel
-            # made a third consecutive vowel count as another syllable.
-            if not in_vowel:
-                count += 1
-            in_vowel = True
-        else:
-            in_vowel = False
-    return max(1, count)  # at least 1 syllable
-def select_video_for_vc(teacher_ph: str, student_ph: str) -> str:
-    """Pick the feedback-video kind for a teacher/student pronunciation pair.
-
-    Returns "wrong_word" when the overall similarity score is below 40,
-    "success" when no check flags a difference, and otherwise the first
-    flagged issue, checked in the order vowel, consonant, ending, stress,
-    syllable.
-    """
-    # Early check: if overall similarity is very low, treat as wrong word.
-    # threshold chosen empirically; adjust (0-100). <40 => likely a different word.
-    if phoneme_similarity_score(teacher_ph, student_ph) < 40:
-        return "wrong_word"
-    tokens_t = split_ipa_tokens(teacher_ph)
-    tokens_s = split_ipa_tokens(student_ph)
-    flags = []
-    if extract_vowel_sequence(teacher_ph) != extract_vowel_sequence(student_ph):
-        flags.append("vowel")
-    if extract_consonant_tokens(tokens_t) != extract_consonant_tokens(tokens_s):
-        flags.append("consonant")
-    # Ending and stress are only compared when both sides provide a value.
-    end_t = last_ending_token(tokens_t)
-    end_s = last_ending_token(tokens_s)
-    if end_t and end_s and end_t != end_s:
-        flags.append("ending")
-    stress_t = primary_stress_position(tokens_t)
-    stress_s = primary_stress_position(tokens_s)
-    if stress_t is not None and stress_s is not None and stress_t != stress_s:
-        flags.append("stress")
-    if syllable_estimate(teacher_ph) != syllable_estimate(student_ph):
-        flags.append("syllable")
-    if not flags:
-        return "success"  # Correct pronunciation
-    # With several issues at once, report the first one found. The former
-    # "mixed" result has no entry in static_video_for(), so the response
-    # carried no video id, URL or hint at all.
-    return flags[0]
-# -----------------------------
-# Route: Score pronunciation with targeted feedback
-# -----------------------------
-@pronvideo_bp.route("/score", methods=["POST"])
-def score_pronunciation():
-    """Score an uploaded recording of a single target word.
-
-    Expects a multipart form with an ``audio`` file and a ``word`` field.
-    Responds 400 when either is missing, 500 on any processing exception, and
-    otherwise 200 with the score, a message and the matching feedback video.
-    """
-    if "audio" not in request.files:
-        return jsonify({"score": 0, "error": "audio_required"}), 400
-    expected_word = request.form.get("word", "").strip().lower()
-    if not expected_word:
-        return jsonify({"score": 0, "error": "word_required"}), 400
-    def reply(kind, score, message, heard, error=None):
-        # Shared 200 response shape; "error" is only present when supplied.
-        clip = static_video_for(kind)
-        payload = {"score": score}
-        if error is not None:
-            payload["error"] = error
-        payload.update({
-            "message": message,
-            "hint": clip["hint"],
-            "videoId": clip["videoId"],
-            "videoUrl": clip["videoUrl"],
-            "expected": expected_word,
-            "heard": heard,
-        })
-        return jsonify(payload), 200
-    upload = request.files["audio"]
-    wav_path = None
-    try:
-        wav_path = convert_to_wav_temp(upload)
-        # Transcribe, then reduce the transcript to letters and whitespace.
-        heard = normalize(transcribe_audio(wav_path))
-        if not heard:
-            return reply("silence", 0, "No speech detected.", "", error="no_asr_text")
-        if len(heard.split()) > 1:
-            return reply(
-                "multipleword",
-                0,
-                f"Detected multiple words: '{heard}'. Please say only '{expected_word}'.",
-                heard,
-                error="multiple_words",
-            )
-        # The plain spellings are compared directly; no phoneme conversion happens here.
-        score = phoneme_similarity_score(expected_word, heard)
-        # Success only when exact match and high score
-        if heard == expected_word and score >= 90:
-            return reply("success", score, f"Excellent. You pronounced '{expected_word}' correctly.", heard)
-        # Mismatch: choose targeted feedback for the kind of difference found.
-        return reply(
-            select_video_for_vc(expected_word, heard),
-            score,
-            "Good try. Some sounds need practice.",
-            heard,
-        )
-    except Exception as e:
-        return jsonify({"score": 0, "error": "server_exception", "message": str(e)}), 500
-    finally:
-        # Best-effort removal of the converted temp file.
-        if wav_path:
-            try:
-                os.remove(wav_path)
-            except Exception:
-                pass
-# IPA helpers and constants (adds split_ipa_tokens and related helpers)
-# Single characters treated as vowels. set() over a string yields its individual
-# characters, so the diphthongs spelled out in the literal add no entries of their own.
-VOWELS = set("aeiouɪʊɛæɔɑəɜɒeɪoʊaɪɔɪ")  # extend with additional IPA symbols as needed
-# Stress markers; the tokenizer emits each one as a separate token.
-PRIMARY_STRESS = "ˈ"
-SECONDARY_STRESS = "ˌ"
-# Multi-character consonants the tokenizer looks for before falling back to single characters.
-IPA_DIGRAPHS = {"tʃ", "dʒ", "t͡ʃ", "d͡ʒ"}  # common multi-char IPA consonants
-def split_ipa_tokens(ipa: str):
-    """
-    Split an IPA (or plain-spelling) string into a flat list of tokens.
-    - Stress markers come out as tokens of their own.
-    - Known digraphs (e.g. 'tʃ', 'dʒ') are kept together.
-    - Input containing spaces is split into words first; the words' tokens are
-      concatenated in order.
-    Empty or missing input yields an empty list.
-    """
-    if not ipa:
-        return []
-    ipa = ipa.strip()
-    if " " not in ipa:
-        return _tokenize_chunk(ipa)
-    return [token for word in ipa.split() for token in _tokenize_chunk(word)]
-def _tokenize_chunk(chunk: str):
-    """Tokenize one whitespace-free chunk into stress markers, digraphs and single characters.
-
-    Entries of IPA_DIGRAPHS are matched at the current position, longest first,
-    whatever their length. The tie-bar forms ('t͡ʃ', 'd͡ʒ') are three code points
-    long, so a fixed two-character window could never match them.
-    """
-    tokens = []
-    # Longest entries first so a longer match always wins over a shorter one.
-    multi_char = sorted(IPA_DIGRAPHS, key=len, reverse=True)
-    i = 0
-    while i < len(chunk):
-        ch = chunk[i]
-        # stress markers
-        if ch in (PRIMARY_STRESS, SECONDARY_STRESS):
-            tokens.append(ch)
-            i += 1
-            continue
-        # try multi-character consonants before single characters
-        found = next((d for d in multi_char if chunk.startswith(d, i)), None)
-        if found:
-            tokens.append(found)
-            i += len(found)
-            continue
-        # fallback single character token
-        tokens.append(ch)
-        i += 1
-    return tokens
-def extract_vowel_sequence(ipa: str):
-    """Return the vowel tokens of *ipa*, in order, joined into one string."""
-    vowel_tokens = []
-    for token in split_ipa_tokens(ipa):
-        if token in VOWELS:
-            vowel_tokens.append(token)
-    return "".join(vowel_tokens)
-def extract_consonant_tokens(tokens):
-    """Return the tokens that are neither vowels, stress markers nor blank."""
-    consonants = []
-    for token in tokens:
-        if token in VOWELS or token in (PRIMARY_STRESS, SECONDARY_STRESS):
-            continue
-        if token.strip():
-            consonants.append(token)
-    return consonants
-def last_ending_token(tokens):
-    """Return the final token that is non-empty and not a stress marker, or None."""
-    stress_marks = (PRIMARY_STRESS, SECONDARY_STRESS)
-    candidates = (token for token in reversed(tokens) if token and token not in stress_marks)
-    return next(candidates, None)
-def primary_stress_position(tokens):
-    """
-    Return the index of the first primary stress marker in *tokens*, or None
-    when there is none. Used as a coarse way to compare stress placement
-    between the expected and the spoken form.
-    """
-    if PRIMARY_STRESS in tokens:
-        return tokens.index(PRIMARY_STRESS)
-    return None

ragg/app.py CHANGED Viewed

@@ -1,26 +1,24 @@
-import os
 import time
-import json
-import requests
-from dotenv import load_dotenv, find_dotenv
-from flask import Flask, Blueprint, request, jsonify, current_app, send_from_directory
-# Note: we avoid creating a Flask app at module import time
 import uuid
 from pathlib import Path
 from typing import Iterable, Optional, Sequence, Union
 from flask_cors import CORS
 import requests
 from TTS.api import TTS
-# --- S3 (added) ---
 try:
     import boto3
     from botocore.exceptions import NoCredentialsError, ClientError
 except Exception:
     boto3 = None
-    NoCredentialsError = ClientError = Exception  # fallbacks so type names exist
-# RAG imports
 try:
     from .rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
     from .rag_llm import (
@@ -29,12 +27,11 @@ try:
         ExplainBody,
         llm_explain,
         FollowupBody,
-        get_vectorstore,
-        get_vectorstore_for,  # ← add this
         llm_followups,
     )
 except ImportError:
-    # Fallback when running as: python ragg/app.py
     from rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
     from rag_llm import (
         LLMBody,
@@ -42,28 +39,80 @@ except ImportError:
         ExplainBody,
         llm_explain,
         FollowupBody,
-        get_vectorstore,
-        get_vectorstore_for,  # ← add this
         llm_followups,
     )
-# OpenAI client (no secret logs)
-import openai
 from openai import OpenAI
 def xtts_speak_to_file(
     text: str,
     out_file: Optional[Union[str, Path]] = None,
-    reference_dir: Optional[Union[str, Path]] = "trim",
     reference_files: Optional[Sequence[Union[str, Path]]] = None,
     language: str = "en",
     patterns: Iterable[str] = ("*.wav", "*.mp3", "*.flac"),
 ) -> Path:
-    """
-    Generate a WAV using XTTS v2 with reference audios; caches the model.
-    """
-    speakers: list[str] = []
     if reference_files:
         speakers.extend(str(Path(p)) for p in reference_files)
@@ -74,142 +123,55 @@ def xtts_speak_to_file(
     speakers = list(dict.fromkeys(speakers))
     if not speakers:
-        raise FileNotFoundError(
-            f"No reference audio files found. Checked: "
-            f"{reference_files or []} and/or {reference_dir}"
-        )
     if not hasattr(xtts_speak_to_file, "_model") or xtts_speak_to_file._model is None:
-        import sys, builtins, torch
-        from torch.serialization import add_safe_globals
-        # --- XTTS internal classes that must be allow-listed ---
-        from TTS.tts.configs.xtts_config import XttsConfig
-        from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
-        from TTS.config.shared_configs import BaseDatasetConfig
-        # Prevent interactive prompts / stdin crashes on Hugging Face
         sys.stdin = open(os.devnull)
         builtins.input = lambda *a, **kw: ""
         os.environ["COQUI_TOS_AGREED"] = "1"
-        # Allowlist all required XTTS classes for PyTorch 2.6+
-        add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
-        # Initialize the XTTS model safely
         xtts_speak_to_file._model = TTS(
             model_name="tts_models/multilingual/multi-dataset/xtts_v2",
             gpu=False,
             progress_bar=False,
         )
-    tts = xtts_speak_to_file._model
     out_path = Path(out_file) if out_file else Path(f"xtts_{uuid.uuid4().hex}.wav")
     out_path.parent.mkdir(parents=True, exist_ok=True)
     try:
-        tts.tts_to_file(
-            text=text,
-            speaker_wav=speakers,
-            language=language,
-            file_path=str(out_path),
-        )
     except Exception as e:
         raise RuntimeError(f"XTTS synthesis failed: {e}") from e
     return out_path
-# ------------------------------------------------------------
-# Load environment
-# ------------------------------------------------------------
-load_dotenv(find_dotenv())
-openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-# Optional: version log (safe), but do NOT print the API key
-try:
-    print(f"openai package version: {openai.__version__}")
-except Exception:
-    pass
-# --- S3 config (added) ---
-S3_BUCKET = os.getenv("S3_BUCKET", "").strip()
-AWS_REGION = os.getenv("AWS_REGION", "ap-south-1").strip()
-S3_PREFIX = os.getenv("S3_PREFIX", "audio/").strip()
-AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "").strip()
-AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "").strip()
-_s3_client = None
-if boto3 and S3_BUCKET and AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY:
-    try:
-        _s3_client = boto3.client(
-            "s3",
-            region_name=AWS_REGION,
-            aws_access_key_id=AWS_ACCESS_KEY_ID,
-            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-        )
-    except Exception as _e:
-        _s3_client = None
-def _upload_to_s3(file_path: Union[str, Path]) -> Optional[str]:
-    """
-    Upload the file to S3 and return a presigned URL (24h).
-    If S3 is not configured, returns None (caller will fallback).
-    """
-    if not _s3_client or not S3_BUCKET:
-        return None
-    try:
-        file_path = str(file_path)
-        key = f"{S3_PREFIX}{Path(file_path).name}"
-        _s3_client.upload_file(file_path, S3_BUCKET, key)
-        url = _s3_client.generate_presigned_url(
-            "get_object",
-            Params={"Bucket": S3_BUCKET, "Key": key},
-            ExpiresIn=24 * 3600,
-        )
-        return url
-    except (NoCredentialsError, ClientError) as e:
-        try:
-            current_app.logger.error(f"S3 upload failed: {e}")
-        except Exception:
-            print(f"S3 upload failed: {e}")
-        return None
-# Media and voice references
-# MEDIA_ROOT = Path(os.getenv("MEDIA_ROOT", "./media"))
-# AUDIO_DIR = MEDIA_ROOT / "audio"
-# AUDIO_DIR.mkdir(parents=True, exist_ok=True)
-# XTTS_REF_DIR = os.getenv("XTTS_REF_DIR", "./trim")  # folder with your reference audios
-BASE_DIR = Path(__file__).resolve().parent.parent  # if app.py is top-level; if it's ragg/app.py use .parent.parent
-MEDIA_ROOT = Path(os.getenv("MEDIA_ROOT", str(BASE_DIR / "media")))
-AUDIO_DIR = MEDIA_ROOT / "audio"
-AUDIO_DIR.mkdir(parents=True, exist_ok=True)
-XTTS_REF_DIR = os.getenv("XTTS_REF_DIR", str(BASE_DIR / "trim"))  # reference voice files
-# D-ID config (optional)
-# ------------------------------------------------------------
-# Blueprint (mounted at /rag by the main app)
-# ------------------------------------------------------------
-rag_bp = Blueprint("rag", __name__)
 @rag_bp.route("/audio/<path:filename>", methods=["GET"])
 def rag_serve_audio(filename: str):
-    return send_from_directory(AUDIO_DIR, filename, mimetype="audio/wav", conditional=True)
-# D-ID config (set in .env / HF Secrets)
-DID_API_KEY = os.getenv("DID_API_KEY", "")
-DID_SOURCE_IMAGE_URL = os.getenv("DID_SOURCE_IMAGE_URL", "")
-DID_VOICE_ID = os.getenv("DID_VOICE_ID", "en-US-JennyNeural")
-# Default folder for /ingest-pdfs
-PDF_DEFAULT_FOLDER = os.getenv("RAG_PDF_DIR", "./pdfs")
-# Optional: add CORS headers (the main app should still enable CORS globally)
 @rag_bp.after_app_request
 def add_cors_headers(resp):
     origin = request.headers.get("Origin")
-    # Allow local Angular during dev; main app may add more origins
     if origin in ("http://localhost:4200", "http://127.0.0.1:4200"):
         resp.headers["Access-Control-Allow-Origin"] = origin
         resp.headers["Vary"] = "Origin"
@@ -218,23 +180,14 @@ def add_cors_headers(resp):
     return resp
-# ------------------------------------------------------------
-# Helpers
-# ------------------------------------------------------------
-def user_to_db_level(username: str | None) -> str | None:
     if not username:
         return None
     u = username.strip().lower()
-    if u == "lowergrade":
-        return "low"
-    if u == "midgrade":
-        return "mid"
-    if u == "highergrade":
-        return "high"
-    return None
-def extract_username_from_request(req) -> str | None:
     hdr = req.headers.get("X-User")
     if hdr:
         return hdr
@@ -242,7 +195,7 @@ def extract_username_from_request(req) -> str | None:
     return data.get("username")
-# --- D-ID helpers ---
 def _did_create_talk(text: str):
     if not DID_API_KEY:
         return None, ("DID_API_KEY not set on the server", 500)
@@ -250,11 +203,7 @@ def _did_create_talk(text: str):
         return None, ("DID_SOURCE_IMAGE_URL not set on the server", 500)
     payload = {
-        "script": {
-            "type": "text",
-            "input": text,
-            "provider": {"type": "microsoft", "voice_id": DID_VOICE_ID},
-        },
         "source_url": DID_SOURCE_IMAGE_URL,
         "config": {"fluent": True, "pad_audio": 0},
     }
@@ -292,16 +241,65 @@ def _did_poll_talk(talk_id: str, timeout_sec: int = 60, interval_sec: float = 2.
         return None, ("D-ID poll failed", 502)
-# ------------------------------------------------------------
-# Endpoints (NOTE: no "/rag" prefix here; the blueprint adds it)
-# ------------------------------------------------------------
 @rag_bp.route("/ingest", methods=["POST", "OPTIONS"])
 def rag_ingest():
     if request.method == "OPTIONS":
         return ("", 204)
     body = IngestBody(**(request.json or {}))
-    result = ingest_documents(body)
-    return jsonify(result)
 @rag_bp.route("/ingest-pdfs", methods=["POST", "OPTIONS"])
@@ -310,11 +308,7 @@ def rag_ingest_pdfs():
         return ("", 204)
     data = request.json or {}
     folder = data.get("folder", PDF_DEFAULT_FOLDER)
-    subject = data.get("subject")
-    grade = data.get("grade")
-    chapter = data.get("chapter")
-    result = ingest_pdfs_from_folder(folder, subject=subject, grade=grade, chapter=chapter)
-    return jsonify(result)
 @rag_bp.route("/generate-questions", methods=["POST", "OPTIONS"])
@@ -327,145 +321,26 @@ def rag_generate_questions():
     if not data.get("db_level"):
         data["db_level"] = mapped_level
     body = LLMBody(**data)
-    result = llm_generate(body)
-    return jsonify(result)
-# @rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
-# @rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
-# def rag_explain_grammar():
-#     if request.method == "OPTIONS":
-#         return ("", 204)
-#     data = request.get_json(force=True) or {}
-#     # --- Extract username and db_level ---
-#     username = extract_username_from_request(request)
-#     db_level = user_to_db_level(username)
-#     # --- MAIN BODY (your preferred structure) ---
-#     body = ExplainBody(
-#         question=(data.get("question") or "").strip(),
-#         model=data.get("model", "gpt-4o-mini"),
-#         db_level=db_level,
-#         source_ids=data.get("source_ids") or []
-#     )
-#     # --- 1) Run LLM / RAG explanation ---
-#     result_raw = llm_explain(body)
-#     # --- 2) Normalize + extract answer safely ---
-#     result_dict = None
-#     answer_text = ""
-#     try:
-#         if isinstance(result_raw, dict):
-#             result_dict = dict(result_raw)
-#         elif hasattr(result_raw, "model_dump"):
-#             result_dict = result_raw.model_dump()
-#         elif hasattr(result_raw, "dict"):
-#             result_dict = result_raw.dict()
-#         elif isinstance(result_raw, str):
-#             result_dict = {"answer": result_raw}
-#         else:
-#             result_dict = {"answer": str(result_raw)}
-#         answer_text = (
-#             result_dict.get("answer")
-#             or result_dict.get("response")
-#             or result_dict.get("text")
-#             or ""
-#         ).strip()
-#     except Exception as e:
-#         current_app.logger.exception("Failed to normalize llm_explain result: %s", e)
-#         return jsonify({"error": "Internal error normalizing LLM response"}), 500
-#     # --- 3) Optional: synthesize TTS audio ---
-#     try:
-#         if data.get("synthesize_audio"):
-#             try:
-#                 out_name = f"explain_{uuid.uuid4().hex}.wav"
-#                 wav_path = xtts_speak_to_file(
-#                     text=answer_text or result_dict.get("answer", ""),
-#                     out_file=AUDIO_DIR / out_name,
-#                     reference_dir=XTTS_REF_DIR,
-#                     reference_files=None,
-#                     language=data.get("language", "en"),
-#                 )
-#                 # Local: serve from /rag/audio/*
-#                 if "localhost" in request.host_url or "127.0.0.1" in request.host_url:
-#                     base = request.host_url.rstrip("/")
-#                     result_dict["audio_url"] = f"{base}/rag/audio/{wav_path.name}"
-#                 else:
-#                     # Deployed: try S3 first; fallback to public SPACE_URL if set
-#                     s3_url = _upload_to_s3(str(wav_path))
-#                     if s3_url:
-#                         result_dict["audio_url"] = s3_url
-#                     else:
-#                         base = os.getenv("SPACE_URL", "https://pykara-py-learn-backend.hf.space")
-#                         result_dict["audio_url"] = f"{base}/rag/audio/{wav_path.name}"
-#             except FileNotFoundError as e:
-#                 current_app.logger.error("XTTS reference audio missing: %s", e)
-#             except Exception as e:
-#                 current_app.logger.exception("XTTS synthesis during explain-grammar failed: %s", e)
-#     except Exception:
-#         current_app.logger.exception("Unexpected error while attempting inline synthesis")
-#     # --- 4) Optional: synthesize video (D-ID) ---
-#     try:
-#         if data.get("synthesize_video"):
-#             if not DID_API_KEY or not DID_SOURCE_IMAGE_URL:
-#                 current_app.logger.error("D-ID not configured for inline explain-grammar video synthesis")
-#             else:
-#                 try:
-#                     talk_id, err = _did_create_talk(answer_text or result_dict.get("answer", ""))
-#                     if err:
-#                         current_app.logger.error(
-#                             "D-ID create error during explain-grammar: %s",
-#                             err[0] if isinstance(err, tuple) else err,
-#                         )
-#                     else:
-#                         video_url, err = _did_poll_talk(talk_id, timeout_sec=120, interval_sec=2.0)
-#                         if err:
-#                             current_app.logger.error(
-#                                 "D-ID poll error during explain-grammar: %s",
-#                                 err[0] if isinstance(err, tuple) else err,
-#                             )
-#                         else:
-#                             if video_url:
-#                                 result_dict["video_url"] = video_url
-#                 except Exception as e:
-#                     current_app.logger.exception("D-ID inline synthesis failed during explain-grammar: %s", e)
-#     except Exception:
-#         current_app.logger.exception("Unexpected error while attempting inline video synthesis")
-#     # --- Final response ---
-#     return jsonify(result_dict), 200
 @rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
 def rag_explain_grammar():
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
-    # --- Extract username and db_level ---
     username = extract_username_from_request(request)
-    db_level = user_to_db_level(username)
-    # --- MAIN BODY (your preferred structure) ---
     body = ExplainBody(
         question=(data.get("question") or "").strip(),
         model=data.get("model", "gpt-4o-mini"),
-        db_level=db_level,
-        source_ids=data.get("source_ids") or []
     )
-    # --- 1) Run LLM / RAG explanation ---
     result_raw = llm_explain(body)
-    # --- 2) Normalize + extract answer safely ---
-    result_dict = None
-    answer_text = ""
     try:
         if isinstance(result_raw, dict):
             result_dict = dict(result_raw)
@@ -477,177 +352,144 @@ def rag_explain_grammar():
             result_dict = {"answer": result_raw}
         else:
             result_dict = {"answer": str(result_raw)}
-        answer_text = (
-            result_dict.get("answer")
-            or result_dict.get("response")
-            or result_dict.get("text")
-            or ""
-        ).strip()
     except Exception as e:
         current_app.logger.exception("Failed to normalize llm_explain result: %s", e)
         return jsonify({"error": "Internal error normalizing LLM response"}), 500
-    # --- 3) Optional: synthesize TTS audio ---
-    try:
-        if data.get("synthesize_audio"):
             try:
-                out_name = f"explain_{uuid.uuid4().hex}.wav"
-                wav_path = xtts_speak_to_file(
-                    text=answer_text or result_dict.get("answer", ""),
-                    out_file=AUDIO_DIR / out_name,
-                    reference_dir=XTTS_REF_DIR,
-                    reference_files=None,
-                    language=data.get("language", "en"),
-                )
-                base = request.host_url.rstrip("/")
-                result_dict["audio_url"] = f"{base}/rag/audio/{wav_path.name}"
-            except FileNotFoundError as e:
-                current_app.logger.error("XTTS reference audio missing: %s", e)
             except Exception as e:
-                current_app.logger.exception("XTTS synthesis during explain-grammar failed: %s", e)
-    except Exception:
-        current_app.logger.exception("Unexpected error while attempting inline synthesis")
-    # --- 4) Optional: synthesize video (D-ID) ---
-    try:
-        if data.get("synthesize_video"):
             if not DID_API_KEY or not DID_SOURCE_IMAGE_URL:
                 current_app.logger.error("D-ID not configured for inline explain-grammar video synthesis")
             else:
                 try:
                     talk_id, err = _did_create_talk(answer_text or result_dict.get("answer", ""))
                     if err:
-                        current_app.logger.error(
-                            "D-ID create error during explain-grammar: %s",
-                            err[0] if isinstance(err, tuple) else err,
-                        )
                     else:
                         video_url, err = _did_poll_talk(talk_id, timeout_sec=120, interval_sec=2.0)
                         if err:
-                            current_app.logger.error(
-                                "D-ID poll error during explain-grammar: %s",
-                                err[0] if isinstance(err, tuple) else err,
-                            )
-                        else:
-                            if video_url:
-                                result_dict["video_url"] = video_url
                 except Exception as e:
                     current_app.logger.exception("D-ID inline synthesis failed during explain-grammar: %s", e)
-    except Exception:
-        current_app.logger.exception("Unexpected error while attempting inline video synthesis")
-    # --- Final response ---
     return jsonify(result_dict), 200
-# @rag_bp.route("/suggest-followups", methods=["POST", "OPTIONS"])
 @rag_bp.route("/suggest-followups", methods=["POST", "OPTIONS"])
 def rag_suggest_followups():
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
     username = extract_username_from_request(request)
-    db_level = user_to_db_level(username)
     body = FollowupBody(
         last_question=(data.get("last_question") or "").strip(),
         last_answer=(data.get("last_answer") or "").strip(),
         n=int(data.get("n", 5)),
         model=data.get("model", "gpt-4o-mini"),
-        db_level=db_level,
-        source_ids=data.get("source_ids") or []   # ← same addition here
     )
-    result = llm_followups(body)
-    return jsonify(result)
-# @rag_bp.get("/_diag")
 @rag_bp.get("/_diag")
 def rag_diag():
     try:
-        from .rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
     except ImportError:
-        from rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
-    import os
-    from flask import jsonify
     def _count(vs):
-        """Handle both LangChain and chromadb client objects."""
         if vs is None:
             return None
-        # 1️⃣ chromadb.Collection (your new get_vectorstore_for)
         if hasattr(vs, "count") and callable(vs.count):
             try:
                 return vs.count()
             except Exception:
                 return None
-        # 2️⃣ LangChain vectorstore
         if hasattr(vs, "_collection"):
             try:
-                return vs._collection.count()  # type: ignore
             except Exception:
                 try:
-                    return vs._client.get_collection(vs._collection.name).count()  # type: ignore
                 except Exception:
                     return None
         return None
-    # load each level safely
-    low_vs = get_vectorstore_for("low")
-    mid_vs = get_vectorstore_for("mid")
-    high_vs = get_vectorstore_for("high")
     info = {
-        "env_seen": {
-            "CHROMA_DIR": CHROMA_DIR,
-            "CHROMA_ROOT": CHROMA_ROOT
-        },
-        "low_dir": {
-            "path": os.path.join(CHROMA_ROOT, "low"),
-            "exists": os.path.isdir(os.path.join(CHROMA_ROOT, "low")),
-        },
-        "counts_default": _count(get_vectorstore()),
         "counts_low": _count(low_vs),
         "counts_mid": _count(mid_vs),
         "counts_high": _count(high_vs),
     }
     return jsonify(info), 200
-# def rag_diag():
-#     # minimal imports here to avoid circulars
-#     try:
-#         from .rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
-#     except ImportError:
-#         from rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore, get_vectorstore_for
-#
-#     import os
-#     from flask import jsonify
-#
-#     def _count(vs):
-#         try:
-#             return vs._collection.count()
-#         except Exception:
-#             try:
-#                 return vs._client.get_collection(vs._collection.name).count()
-#             except Exception:
-#                 return None
-#
-#     info = {
-#         "env_seen": {"CHROMA_DIR": CHROMA_DIR, "CHROMA_ROOT": CHROMA_ROOT},
-#         "low_dir": {
-#             "path": os.path.join(CHROMA_ROOT, "low"),
-#             "exists": os.path.isdir(os.path.join(CHROMA_ROOT, "low")),
-#         },
-#         "counts_default": _count(get_vectorstore()),
-#         "counts_low": _count(get_vectorstore_for("low")),
-#         "counts_mid": _count(get_vectorstore_for("mid")),
-#         "counts_high": _count(get_vectorstore_for("high")),
-#     }
-#     return jsonify(info), 200
 @rag_bp.route("/search", methods=["POST", "OPTIONS"])
 def rag_search():
@@ -657,72 +499,42 @@ def rag_search():
     q = (data.get("q") or "").strip()
     if not q:
         return jsonify({"results": []})
-    # derive db_level from login, unless explicitly provided
     username = extract_username_from_request(request)
-    mapped_level = user_to_db_level(username)
-    db_level = data.get("db_level") or mapped_level
     vs = get_vectorstore_for(db_level)
     hits = vs.similarity_search_with_score(q, k=5)
     out = []
     for doc, dist in hits:
-        out.append({
-            "distance": float(dist),
-            "snippet": doc.page_content[:200],
-            "source_path": os.path.normpath(doc.metadata.get("source_path", "")),
-            "page": doc.metadata.get("page_1based"),
-        })
     return jsonify({"results": out})
-def generate_questions_from_vectorstore():
     try:
         vectorstore = get_vectorstore()
         query_text = "important content related to grammar"
         results = vectorstore.similarity_search_with_score(query_text, k=5)
-        print(f"Vectorstore query returned {len(results)} results")
         content = "\n".join([doc.page_content for doc, _ in results])
-        print(f"Retrieved content: {content[:500]}...")
         if not content:
-            return {"error": "No content retrieved from vectorstore. Please ingest PDFs first."}
         prompt = f"Generate 5 important questions based on the following content: {content}"
         response = openai_client.chat.completions.create(
-            model="gpt-4o-mini",
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.7,
-            max_tokens=150,
         )
-        response_text = response.choices[0].message.content.strip()
-        print(f"Processed OpenAI response: {response_text}")
-        return response_text
     except Exception as e:
-        print(f"Error during OpenAI API call: {e}")
-        return {"error": f"Failed to call OpenAI: {str(e)}"}
-@rag_bp.route("/generate-questions-from-chroma", methods=["POST", "OPTIONS"])
-def generate_questions_from_chroma():
-    def _generate_questions_from_vectorstore():
-        try:
-            vectorstore = get_vectorstore()
-            query_text = "important content related to grammar"
-            results = vectorstore.similarity_search_with_score(query_text, k=5)
-            content = "\n".join([doc.page_content for doc, _ in results])
-            if not content:
-                return {"error": "No content retrieved from vectorstore. Please ingest PDFs first."}
-            prompt = f"Generate 5 important questions based on the following content: {content}"
-            response = openai_client.chat.completions.create(
-                model="gpt-4o-mini",
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.7,
-                max_tokens=150,
-            )
-            return response.choices[0].message.content.strip()
-        except Exception as e:
-            return {"error": f"Failed to call OpenAI: {str(e)}"}
-    generated = _generate_questions_from_vectorstore()
     return jsonify({"generated_questions": generated})
@@ -730,115 +542,111 @@ def generate_questions_from_chroma():
 def health():
     return {"status": "ok"}, 200
 @rag_bp.route("/synthesize-audio", methods=["POST", "OPTIONS"])
 def rag_synthesize_audio():
-    """
-    Synthesize text to WAV on demand using XTTS and return a public URL.
-    Body: { "text": "...", "language": "en", "reference_files": ["trim/foo.wav", ...] }
-    """
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
     text = (data.get("text") or "").strip()
     if not text:
         return jsonify({"error": "No text provided"}), 400
-    language = data.get("language", "en")
-    reference_files = data.get("reference_files")  # optional list of paths
     try:
         out_name = f"synth_{uuid.uuid4().hex}.wav"
         wav_path = xtts_speak_to_file(
-            text=text,
-            out_file=AUDIO_DIR / out_name,
-            reference_dir=XTTS_REF_DIR,
-            reference_files=reference_files,
-            language=language,
         )
-        # Local: serve static file
         if "localhost" in request.host_url or "127.0.0.1" in request.host_url:
             base = request.host_url.rstrip("/")
             audio_url = f"{base}/rag/audio/{wav_path.name}"
         else:
-            # Deployed: try S3 first; fallback to SPACE_URL
             s3_url = _upload_to_s3(str(wav_path))
             if s3_url:
                 audio_url = s3_url
             else:
-                base = os.getenv("SPACE_URL", "https://pykara-py-learn-backend.hf.space")
                 audio_url = f"{base}/rag/audio/{wav_path.name}"
         return jsonify({"audio_url": audio_url, "file": wav_path.name}), 200
-    except Exception as e:
-        import traceback
-        print("=== XTTS DEBUG ERROR ===")
-        print(traceback.format_exc())
-        print("========================")
-        return jsonify({"error": "Synthesis failed", "detail": str(e)}), 500
-    # except FileNotFoundError as e:
-    #     current_app.logger.error("XTTS references missing: %s", e)
-    #     return jsonify({"error": "XTTS reference audio files not found on server"}), 500
     except Exception as e:
         current_app.logger.exception("XTTS synthesis error: %s", e)
-        return jsonify({"error": "Synthesis failed"}), 500
 @rag_bp.route("/synthesize-video", methods=["POST", "OPTIONS"])
 def rag_synthesize_video():
-    """
-    Synthesize a short video on-demand using the D-ID service and return the public video URL.
-    Body: { "text": "..." }
-    """
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
     text = (data.get("text") or "").strip()
     if not text:
         return jsonify({"error": "No text provided"}), 400
-    # Quick config check
     if not DID_API_KEY or not DID_SOURCE_IMAGE_URL:
         current_app.logger.error("D-ID not configured (DID_API_KEY or DID_SOURCE_IMAGE_URL missing)")
         return jsonify({"error": "D-ID not configured on server"}), 500
     try:
-        # Create talk (calls D-ID /talks)
         talk_id, err = _did_create_talk(text)
         if err:
-            # _did_create_talk returns (None, (msg, status))
-            current_app.logger.error("D-ID create error: %s", err[0])
             return jsonify({"error": err[0]}), err[1]
-        # Poll for result URL
         video_url, err = _did_poll_talk(talk_id, timeout_sec=120, interval_sec=2.0)
         if err:
-            current_app.logger.error("D-ID poll error: %s", err[0])
             return jsonify({"error": err[0]}), err[1]
         if not video_url:
-            current_app.logger.error("D-ID did not return a video URL for talk %s", talk_id)
             return jsonify({"error": "D-ID did not return a video URL"}), 502
         return jsonify({"video_url": video_url}), 200
     except Exception as e:
         current_app.logger.exception("Unexpected error generating D-ID video: %s", e)
         return jsonify({"error": "Internal server error generating video"}), 500
-# ------------------------------------------------------------
-# Local runner (DEV ONLY)
-# ------------------------------------------------------------
-if __name__ == "__main__":
-    # Allow this module to run as a standalone server on port 7000 for local dev
-    from flask import Flask
-    from flask_cors import CORS
-    app = Flask(__name__)
-    # CORS for local dev (the production app sets CORS globally in verification.py)
     CORS(
         app,
         resources={r"/rag/*": {"origins": ["http://localhost:4200", "http://127.0.0.1:4200"]}},
@@ -846,10 +654,6 @@ if __name__ == "__main__":
         allow_headers=["Content-Type", "Authorization", "X-User"],
         methods=["GET", "POST", "OPTIONS"],
     )
-    # Ensure Chroma dir exists (use CHROMA_DIR if set)
     os.makedirs(os.getenv("CHROMA_DIR", "./chroma"), exist_ok=True)
-    # Mount blueprint at /rag and run
     app.register_blueprint(rag_bp, url_prefix="/rag")
-    app.run(host="0.0.0.0", port=7000, debug=True)

+import os
 import time
 import uuid
 from pathlib import Path
 from typing import Iterable, Optional, Sequence, Union
+from dotenv import load_dotenv, find_dotenv
+from flask import Flask, Blueprint, request, jsonify, current_app, send_from_directory
 from flask_cors import CORS
 import requests
 from TTS.api import TTS
 try:
     import boto3
     from botocore.exceptions import NoCredentialsError, ClientError
 except Exception:
     boto3 = None
+    NoCredentialsError = ClientError = Exception
+# local imports (support running as a package or module)
 try:
     from .rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
     from .rag_llm import (
         ExplainBody,
         llm_explain,
         FollowupBody,
         llm_followups,
+        get_vectorstore,
+        get_vectorstore_for,
     )
 except ImportError:
     from rag_backend import IngestBody, ingest_documents, ingest_pdfs_from_folder
     from rag_llm import (
         LLMBody,
         ExplainBody,
         llm_explain,
         FollowupBody,
         llm_followups,
+        get_vectorstore,
+        get_vectorstore_for,
     )
 from openai import OpenAI
+load_dotenv(find_dotenv())
+openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", ""))
+# Configuration
+S3_BUCKET = os.getenv("S3_BUCKET", "").strip()
+AWS_REGION = os.getenv("AWS_REGION", "ap-south-1").strip()
+S3_PREFIX = os.getenv("S3_PREFIX", "audio/").strip()
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "").strip()
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "").strip()
+BASE_DIR = Path(__file__).resolve().parent.parent
+MEDIA_ROOT = Path(os.getenv("MEDIA_ROOT", str(BASE_DIR / "media")))
+AUDIO_DIR = MEDIA_ROOT / "audio"
+AUDIO_DIR.mkdir(parents=True, exist_ok=True)
+XTTS_REF_DIR = Path(os.getenv("XTTS_REF_DIR", str(BASE_DIR / "assets")))
+DID_API_KEY = os.getenv("DID_API_KEY", "")
+DID_SOURCE_IMAGE_URL = os.getenv("DID_SOURCE_IMAGE_URL", "")
+DID_VOICE_ID = os.getenv("DID_VOICE_ID", "en-US-JennyNeural")
+PDF_DEFAULT_FOLDER = os.getenv("RAG_PDF_DIR", "../assets/pdfs")
+# init optional s3 client
+_s3_client = None
+if boto3 and S3_BUCKET and AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY:
+    try:
+        _s3_client = boto3.client(
+            "s3",
+            region_name=AWS_REGION,
+            aws_access_key_id=AWS_ACCESS_KEY_ID,
+            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+        )
+    except Exception:
+        _s3_client = None
+rag_bp = Blueprint("rag", __name__)
+REMOTE_API_URL = "https://kw6j9hcwmljvpa-5000.proxy.runpod.net/generate"
+def _upload_to_s3(file_path: Union[str, Path]) -> Optional[str]:
+    """Upload file to S3 and return presigned URL or None."""
+    if not _s3_client or not S3_BUCKET:
+        return None
+    try:
+        file_path = str(file_path)
+        key = f"{S3_PREFIX}{Path(file_path).name}"
+        _s3_client.upload_file(file_path, S3_BUCKET, key)
+        return _s3_client.generate_presigned_url(
+            "get_object", Params={"Bucket": S3_BUCKET, "Key": key}, ExpiresIn=24 * 3600
+        )
+    except (NoCredentialsError, ClientError) as e:
+        try:
+            current_app.logger.error("S3 upload failed: %s", e)
+        except Exception:
+            print("S3 upload failed:", e)
+        return None
+# XTTS helper (lazy-initializes the Coqui model)
 def xtts_speak_to_file(
     text: str,
     out_file: Optional[Union[str, Path]] = None,
+    reference_dir: Optional[Union[str, Path]] = "assets",
     reference_files: Optional[Sequence[Union[str, Path]]] = None,
     language: str = "en",
     patterns: Iterable[str] = ("*.wav", "*.mp3", "*.flac"),
 ) -> Path:
     # Synthesize `text` to an audio file with the Coqui XTTS v2 model, using
     # the given reference recordings as the speaker voice, and return the
     # path of the written file.
     #
     # Raises FileNotFoundError when no reference audio is available and
     # RuntimeError (chained to the original error) when synthesis fails.
     #
     # NOTE(review): in the lines visible here, `reference_dir` is only used in
     # the error message and `patterns` is never read, so speakers come from
     # `reference_files` alone. Lines may be elided in this view - confirm
     # whether directory discovery was meant to be kept.
+    speakers = []
     if reference_files:
         speakers.extend(str(Path(p)) for p in reference_files)
     # dict.fromkeys drops duplicate paths while keeping first-seen order.
     speakers = list(dict.fromkeys(speakers))
     if not speakers:
+        raise FileNotFoundError(f"No reference audio files found: {reference_files or reference_dir}")
     # The model is cached on the function object and loaded on first use only.
     if not hasattr(xtts_speak_to_file, "_model") or xtts_speak_to_file._model is None:
+        import sys, builtins
         # Replace stdin and input() process-wide so nothing during model
         # loading can block on an interactive prompt (presumably the Coqui
         # terms-of-service question, given COQUI_TOS_AGREED below - confirm).
         sys.stdin = open(os.devnull)
         builtins.input = lambda *a, **kw: ""
         os.environ["COQUI_TOS_AGREED"] = "1"
+        # Best-effort registration for safe globals (if available)
+        try:
+            from TTS.tts.configs.xtts_config import XttsConfig
+            from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
+            from TTS.config.shared_configs import BaseDatasetConfig
+            import torch
+            add_safe = getattr(torch.serialization, "add_safe_globals", None)
+            if callable(add_safe):
+                add_safe([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
+        except Exception:
+            pass
         xtts_speak_to_file._model = TTS(
             model_name="tts_models/multilingual/multi-dataset/xtts_v2",
             gpu=False,
             progress_bar=False,
         )
+    tts = xtts_speak_to_file._model
     # Without `out_file`, a uniquely named xtts_<hex>.wav is written relative
     # to the current working directory.
     out_path = Path(out_file) if out_file else Path(f"xtts_{uuid.uuid4().hex}.wav")
     out_path.parent.mkdir(parents=True, exist_ok=True)
     try:
+        tts.tts_to_file(text=text, speaker_wav=speakers, language=language, file_path=str(out_path))
     except Exception as e:
         raise RuntimeError(f"XTTS synthesis failed: {e}") from e
     return out_path
+# Serve audio files from AUDIO_DIR
 @rag_bp.route("/audio/<path:filename>", methods=["GET"])
 def rag_serve_audio(filename: str):
     # Serve a previously generated file from AUDIO_DIR, always labelled as
     # audio/wav; conditional=True turns on Flask's conditional-request handling.
+    return send_from_directory(str(AUDIO_DIR), filename, mimetype="audio/wav", conditional=True)
+# CORS for dev Angular origins
 @rag_bp.after_app_request
 def add_cors_headers(resp):
     # Registered with after_app_request, so it runs for every response of the
     # application, not only for this blueprint's routes. Echoes the Origin
     # back only for the two local Angular dev origins listed below.
     origin = request.headers.get("Origin")
     if origin in ("http://localhost:4200", "http://127.0.0.1:4200"):
         resp.headers["Access-Control-Allow-Origin"] = origin
         # NOTE(review): item assignment looks like it replaces any Vary value
         # set earlier (e.g. by flask-cors or compression) - confirm, and
         # consider appending "Origin" instead.
         resp.headers["Vary"] = "Origin"
     return resp
+def user_to_db_level(username: Optional[str]) -> Optional[str]:
     # Map a login name to its vector-store level: "lowergrade" -> "low",
     # "midgrade" -> "mid", "highergrade" -> "high". Matching ignores case and
     # surrounding whitespace; a missing or unrecognised name yields None.
     if not username:
         return None
     u = username.strip().lower()
+    return {"lowergrade": "low", "midgrade": "mid", "highergrade": "high"}.get(u)
+def extract_username_from_request(req) -> Optional[str]:
     hdr = req.headers.get("X-User")
     if hdr:
         return hdr
     return data.get("username")
+# D-ID helpers
 def _did_create_talk(text: str):
     if not DID_API_KEY:
         return None, ("DID_API_KEY not set on the server", 500)
         return None, ("DID_SOURCE_IMAGE_URL not set on the server", 500)
     payload = {
+        "script": {"type": "text", "input": text, "provider": {"type": "microsoft", "voice_id": DID_VOICE_ID}},
         "source_url": DID_SOURCE_IMAGE_URL,
         "config": {"fluent": True, "pad_audio": 0},
     }
         return None, ("D-ID poll failed", 502)
+# New helper: generate KD Talker video from text (returns (video_url, None) or (None, (msg, status)))
+def _generate_kd_video_from_text(text: str, language: str = "en"):
     # Two-step pipeline: synthesize `text` to a WAV with XTTS, then post that
     # WAV together with the teacher image to the remote server at
     # REMOTE_API_URL, which is expected to answer with a video URL.
     #
     # Always returns a 2-tuple and never raises to the caller:
     #   (video_url, None) on success, (None, (message, http_status)) on failure.
     #
     # The portrait defaults to assets/teacher.png under BASE_DIR and can be
     # overridden with the VIDEO_IMAGE_PATH environment variable.
+    image_path = Path(os.getenv("VIDEO_IMAGE_PATH", str(BASE_DIR / 'assets' / 'teacher.png')))
+    if not image_path.exists():
+        return None, ("Image file not found", 404)
+    # 1) Synthesize audio from text -> save wav under AUDIO_DIR
     # NOTE(review): reference_files is always None here; this only works if
     # xtts_speak_to_file discovers speakers from reference_dir - confirm.
     # The generated genvid_*.wav is left in AUDIO_DIR; nothing here removes it.
+    try:
+        out_name = f"genvid_{uuid.uuid4().hex}.wav"
+        wav_path = xtts_speak_to_file(
+            text=text,
+            out_file=AUDIO_DIR / out_name,
+            reference_dir=XTTS_REF_DIR,
+            reference_files=None,
+            language=language
+        )
+    except FileNotFoundError as e:
+        current_app.logger.error("XTTS references missing: %s", e)
+        return None, ("XTTS reference audio files not found on server", 500)
+    except Exception as e:
+        current_app.logger.exception("XTTS synthesis failed: %s", e)
+        return None, ("Audio synthesis failed", 500)
+    # 2) Call GPU server with image + synthesized audio
+    try:
         # Both files stay open only for the duration of the upload; the text
         # is sent alongside them as an ordinary form field.
+        with image_path.open("rb") as img_file, Path(wav_path).open("rb") as audio_file:
+            files = {
+                "image": ("image", img_file),
+                "audio": ("audio", audio_file),
+            }
+            data_form = {"text": text}
+            response = requests.post(REMOTE_API_URL, files=files, data=data_form, timeout=120)
+        if response.status_code != 200:
             # NOTE(review): the raw upstream body is forwarded to the caller.
+            return None, (f"GPU server error: {response.text}", 502)
+        # Expect JSON { "video_url": "..." }
+        try:
+            payload = response.json()
+            video_url = payload.get("video_url")
+            if not video_url:
+                return None, ("Video URL not found in GPU response", 502)
+            return video_url, None
+        except Exception as e:
+            current_app.logger.exception("GPU response parse failed: %s", e)
+            return None, ("Error parsing GPU response JSON", 500)
+    except Exception as e:
         # Covers connection errors and the 120 s timeout of the POST above.
+        current_app.logger.exception("GPU server request failed: %s", e)
+        return None, ("GPU server request failed", 500)
+# Ingest endpoints
 @rag_bp.route("/ingest", methods=["POST", "OPTIONS"])
 def rag_ingest():
     # Ingest the documents described by the JSON body; the body's keys are
     # passed straight to IngestBody as keyword arguments.
     if request.method == "OPTIONS":
         # CORS preflight: nothing to do beyond an empty 204.
         return ("", 204)
     body = IngestBody(**(request.json or {}))
+    return jsonify(ingest_documents(body))
 @rag_bp.route("/ingest-pdfs", methods=["POST", "OPTIONS"])
         return ("", 204)
     data = request.json or {}
     folder = data.get("folder", PDF_DEFAULT_FOLDER)
+    return jsonify(ingest_pdfs_from_folder(folder, subject=data.get("subject"), grade=data.get("grade"), chapter=data.get("chapter")))
 @rag_bp.route("/generate-questions", methods=["POST", "OPTIONS"])
     if not data.get("db_level"):
         data["db_level"] = mapped_level
     body = LLMBody(**data)
+    return jsonify(llm_generate(body))
 @rag_bp.route("/explain-grammar", methods=["POST", "OPTIONS"])
 def rag_explain_grammar():
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
     username = extract_username_from_request(request)
     body = ExplainBody(
         question=(data.get("question") or "").strip(),
         model=data.get("model", "gpt-4o-mini"),
+        db_level=user_to_db_level(username),
+        source_ids=data.get("source_ids") or [],
     )
     result_raw = llm_explain(body)
+    # normalize result
     try:
         if isinstance(result_raw, dict):
             result_dict = dict(result_raw)
             result_dict = {"answer": result_raw}
         else:
             result_dict = {"answer": str(result_raw)}
     except Exception as e:
         current_app.logger.exception("Failed to normalize llm_explain result: %s", e)
         return jsonify({"error": "Internal error normalizing LLM response"}), 500
+    answer_text = (result_dict.get("answer") or result_dict.get("response") or result_dict.get("text") or "").strip()
+    # optional audio synthesis
+    if data.get("synthesize_audio"):
+        try:
+            out_name = f"explain_{uuid.uuid4().hex}.wav"
+            wav_path = xtts_speak_to_file(
+                text=answer_text or result_dict.get("answer", ""),
+                out_file=AUDIO_DIR / out_name,
+                reference_dir=XTTS_REF_DIR,
+                reference_files=None,
+                language=data.get("language", "en"),
+            )
+            base = request.host_url.rstrip("/")
+            result_dict["audio_url"] = f"{base}/rag/audio/{wav_path.name}"
+        except FileNotFoundError as e:
+            current_app.logger.error("XTTS reference audio missing: %s", e)
+        except Exception as e:
+            current_app.logger.exception("XTTS synthesis during explain-grammar failed: %s", e)
+    # optional video synthesis (D-ID or KD Talker)
+    if data.get("synthesize_video"):
+        # KD Talker path if frontend requested it (chatId === '2')
+        if data.get("kdtalker") or data.get("use_kdtalker"):
             try:
+                video_url, err = _generate_kd_video_from_text(answer_text or result_dict.get("answer", ""), data.get("language", "en"))
+                if err:
+                    try:
+                        current_app.logger.error("KD Talker create error during explain-grammar: %s", err[0] if isinstance(err, tuple) else err)
+                    except Exception:
+                        print("KD Talker error:", err)
+                elif video_url:
+                    result_dict["video_url"] = video_url
             except Exception as e:
+                current_app.logger.exception("KD Talker inline synthesis failed during explain-grammar: %s", e)
+        else:
+            # existing D-ID flow
             if not DID_API_KEY or not DID_SOURCE_IMAGE_URL:
                 current_app.logger.error("D-ID not configured for inline explain-grammar video synthesis")
             else:
                 try:
                     talk_id, err = _did_create_talk(answer_text or result_dict.get("answer", ""))
                     if err:
+                        current_app.logger.error("D-ID create error during explain-grammar: %s", err[0] if isinstance(err, tuple) else err)
                     else:
                         video_url, err = _did_poll_talk(talk_id, timeout_sec=120, interval_sec=2.0)
                         if err:
+                            current_app.logger.error("D-ID poll error during explain-grammar: %s", err[0] if isinstance(err, tuple) else err)
+                        elif video_url:
+                            result_dict["video_url"] = video_url
                 except Exception as e:
                     current_app.logger.exception("D-ID inline synthesis failed during explain-grammar: %s", e)
     return jsonify(result_dict), 200
 @rag_bp.route("/suggest-followups", methods=["POST", "OPTIONS"])
 def rag_suggest_followups():
     # Build a FollowupBody from the JSON payload and return llm_followups()'s
     # result as JSON. The vector-store level is derived from the requesting
     # user's name, not taken from the payload.
     if request.method == "OPTIONS":
         return ("", 204)
     # force=True parses the body as JSON regardless of the Content-Type header.
     data = request.get_json(force=True) or {}
     username = extract_username_from_request(request)
     body = FollowupBody(
         last_question=(data.get("last_question") or "").strip(),
         last_answer=(data.get("last_answer") or "").strip(),
         # NOTE(review): a non-numeric "n" makes int() raise and is not
         # handled in this function.
         n=int(data.get("n", 5)),
         model=data.get("model", "gpt-4o-mini"),
+        db_level=user_to_db_level(username),
+        source_ids=data.get("source_ids") or [],
     )
+    return jsonify(llm_followups(body))
 @rag_bp.get("/_diag")
 def rag_diag():
+    # Vectorstore diagnostics + media & routing checks
     # Returns one JSON object with the Chroma settings seen by rag_llm,
     # document counts for the default/low/mid/high stores, a sample of the
     # reference and generated audio files, and the registered /rag routes.
     # Imported here, with a fallback for running as a plain module instead of
     # as part of the package.
     try:
+        from .rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore as gs, get_vectorstore_for as gvf
     except ImportError:
+        from rag_llm import CHROMA_DIR, CHROMA_ROOT, get_vectorstore as gs, get_vectorstore_for as gvf
     def _count(vs):
         # Best-effort document count: try a public count() first, then the
         # store's private collection handle; None when neither works.
         if vs is None:
             return None
         if hasattr(vs, "count") and callable(vs.count):
             try:
                 return vs.count()
             except Exception:
                 return None
         if hasattr(vs, "_collection"):
             try:
+                return vs._collection.count()
             except Exception:
                 try:
+                    return vs._client.get_collection(vs._collection.name).count()
                 except Exception:
                     return None
         return None
+    low_vs = gvf("low")
+    mid_vs = gvf("mid")
+    high_vs = gvf("high")
+    # media checks
+    ref_dir_exists = XTTS_REF_DIR.exists() and XTTS_REF_DIR.is_dir()
+    ref_files = []
+    if ref_dir_exists:
+        for ext in ("*.wav", "*.mp3", "*.flac"):
+            ref_files.extend([str(p.name) for p in XTTS_REF_DIR.glob(ext)])
+    audio_dir_exists = AUDIO_DIR.exists() and AUDIO_DIR.is_dir()
+    audio_files = [p.name for p in AUDIO_DIR.glob("*.wav")] if audio_dir_exists else []
+    # list registered routes beginning with /rag
+    routes = [r.rule for r in current_app.url_map.iter_rules() if r.rule.startswith("/rag")]
     info = {
+        "env_seen": {"CHROMA_DIR": CHROMA_DIR, "CHROMA_ROOT": CHROMA_ROOT},
+        "low_dir": {"path": str(Path(CHROMA_ROOT) / "low"), "exists": Path(CHROMA_ROOT, "low").is_dir()},
+        "counts_default": _count(gs()),
         "counts_low": _count(low_vs),
         "counts_mid": _count(mid_vs),
         "counts_high": _count(high_vs),
+        "media": {
+            "xtts_ref_dir": str(XTTS_REF_DIR),
+            "xtts_ref_dir_exists": ref_dir_exists,
             # Only the first 10 reference files / 20 audio files are listed.
+            "xtts_ref_files_sample": ref_files[:10],
+            "audio_dir": str(AUDIO_DIR),
+            "audio_dir_exists": audio_dir_exists,
+            "audio_files_sample": audio_files[:20],
+        },
+        "routes": routes,
     }
     return jsonify(info), 200
 @rag_bp.route("/search", methods=["POST", "OPTIONS"])
 def rag_search():
     q = (data.get("q") or "").strip()
     if not q:
         return jsonify({"results": []})
     username = extract_username_from_request(request)
+    db_level = data.get("db_level") or user_to_db_level(username)
     vs = get_vectorstore_for(db_level)
     hits = vs.similarity_search_with_score(q, k=5)
     out = []
     for doc, dist in hits:
+        out.append(
+            {
+                "distance": float(dist),
+                "snippet": doc.page_content[:200],
+                "source_path": os.path.normpath(doc.metadata.get("source_path", "")),
+                "page": doc.metadata.get("page_1based"),
+            }
+        )
     return jsonify({"results": out})
+@rag_bp.route("/generate-questions-from-chroma", methods=["POST", "OPTIONS"])
+def generate_questions_from_chroma():
     # Pull the 5 chunks closest to a fixed grammar query from the default
     # vector store and ask gpt-4o-mini to write 5 questions about them.
     # The request body is not read.
+    if request.method == "OPTIONS":
+        return ("", 204)
     try:
         vectorstore = get_vectorstore()
         query_text = "important content related to grammar"
         results = vectorstore.similarity_search_with_score(query_text, k=5)
         content = "\n".join([doc.page_content for doc, _ in results])
         if not content:
             # NOTE(review): an error payload is returned with HTTP 200.
+            return jsonify({"error": "No content retrieved from vectorstore. Please ingest PDFs first."}), 200
         prompt = f"Generate 5 important questions based on the following content: {content}"
         response = openai_client.chat.completions.create(
+            model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}], temperature=0.7, max_tokens=150
         )
+        generated = response.choices[0].message.content.strip()
     except Exception as e:
         # NOTE(review): this handler also catches vector-store failures, yet
         # the message blames OpenAI; the error dict is then returned under
         # "generated_questions" with the default 200 status.
+        generated = {"error": f"Failed to call OpenAI: {str(e)}"}
     return jsonify({"generated_questions": generated})
 def health():
     return {"status": "ok"}, 200
 @rag_bp.route("/synthesize-audio", methods=["POST", "OPTIONS"])
 def rag_synthesize_audio():
     # Synthesize the posted "text" to a WAV under AUDIO_DIR and return
     # {"audio_url", "file"}. Responds 400 for empty text or missing
     # client-supplied reference files, 500 for server-side failures.
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
     text = (data.get("text") or "").strip()
     if not text:
         return jsonify({"error": "No text provided"}), 400
+    language = (data.get("language") or "en").strip()
     # NOTE(review): reference_files comes straight from the request body and
     # is used below as filesystem paths (existence checks, then handed to the
     # synthesizer). Consider restricting it to names inside XTTS_REF_DIR.
+    reference_files = data.get("reference_files")
+    # preflight checks
+    try:
+        if not reference_files:
             # No explicit files: require XTTS_REF_DIR to exist and hold at
             # least one .wav/.mp3/.flac file.
+            if not XTTS_REF_DIR.exists() or not XTTS_REF_DIR.is_dir():
+                current_app.logger.error("XTTS_REF_DIR not found: %s", XTTS_REF_DIR)
+                return jsonify({"error": "XTTS reference directory not found", "details": str(XTTS_REF_DIR)}), 500
+            has_any = any(XTTS_REF_DIR.glob("*.wav")) or any(XTTS_REF_DIR.glob("*.mp3")) or any(XTTS_REF_DIR.glob("*.flac"))
+            if not has_any:
+                current_app.logger.error("No reference audio files in XTTS_REF_DIR: %s", XTTS_REF_DIR)
+                return jsonify({"error": "XTTS reference audio files not found on server", "details": str(XTTS_REF_DIR)}), 500
+        else:
+            missing = [str(p) for p in reference_files if not Path(p).exists()]
+            if missing:
+                current_app.logger.error("Provided reference_files missing: %s", missing)
+                return jsonify({"error": "One or more reference_files not found", "details": missing}), 400
+    except Exception as pre_e:
+        current_app.logger.exception("Preflight validation failed: %s", pre_e)
+        return jsonify({"error": "Preflight validation failed", "details": str(pre_e)}), 500
     try:
         out_name = f"synth_{uuid.uuid4().hex}.wav"
         wav_path = xtts_speak_to_file(
+            text=text, out_file=AUDIO_DIR / out_name, reference_dir=XTTS_REF_DIR, reference_files=reference_files, language=language
         )
         # URL selection: a local request gets a URL on this host; otherwise
         # try S3 and, when that yields nothing, fall back to the deployed
         # Space's base URL (SPACE_URL).
         if "localhost" in request.host_url or "127.0.0.1" in request.host_url:
             base = request.host_url.rstrip("/")
             audio_url = f"{base}/rag/audio/{wav_path.name}"
         else:
             s3_url = _upload_to_s3(str(wav_path))
             if s3_url:
                 audio_url = s3_url
             else:
+                base = os.getenv("SPACE_URL", "https://majemaai-mj-learn-backend.hf.space")
                 audio_url = f"{base}/rag/audio/{wav_path.name}"
         return jsonify({"audio_url": audio_url, "file": wav_path.name}), 200
+    except FileNotFoundError as e:
+        current_app.logger.error("XTTS references missing: %s", e)
+        return jsonify({"error": "XTTS reference audio files not found on server", "details": str(e)}), 500
     except Exception as e:
         current_app.logger.exception("XTTS synthesis error: %s", e)
+        return jsonify({"error": "Synthesis failed", "details": str(e)}), 500
 @rag_bp.route("/synthesize-video", methods=["POST", "OPTIONS"])
 def rag_synthesize_video():
     # Create a D-ID talk for the posted "text", poll until it yields a video
     # URL, and return {"video_url": ...}.
     if request.method == "OPTIONS":
         return ("", 204)
     data = request.get_json(force=True) or {}
     text = (data.get("text") or "").strip()
     if not text:
         return jsonify({"error": "No text provided"}), 400
     if not DID_API_KEY or not DID_SOURCE_IMAGE_URL:
         current_app.logger.error("D-ID not configured (DID_API_KEY or DID_SOURCE_IMAGE_URL missing)")
         return jsonify({"error": "D-ID not configured on server"}), 500
     try:
         # Both helpers return (value, err); err is indexed as
         # (message, http_status) and relayed to the client unchanged.
         talk_id, err = _did_create_talk(text)
         if err:
             return jsonify({"error": err[0]}), err[1]
         # Polls every 2 s for up to 120 s, per the arguments passed here.
         video_url, err = _did_poll_talk(talk_id, timeout_sec=120, interval_sec=2.0)
         if err:
             return jsonify({"error": err[0]}), err[1]
         if not video_url:
             return jsonify({"error": "D-ID did not return a video URL"}), 502
         return jsonify({"video_url": video_url}), 200
     except Exception as e:
         current_app.logger.exception("Unexpected error generating D-ID video: %s", e)
         return jsonify({"error": "Internal server error generating video"}), 500
+@rag_bp.route("/generate-video-from-text", methods=["POST", "OPTIONS"])
+def generate_video_from_text():
     # HTTP wrapper around _generate_kd_video_from_text: validates "text",
     # forwards the optional "language" (default "en"), and relays either the
     # video URL or the helper's (message, status) error pair.
+    if request.method == "OPTIONS":
+        return ("", 204)
+    data = request.get_json(force=True) or {}
+    text = (data.get("text") or "").strip()
+    if not text:
+        return jsonify({"error": "No text provided"}), 400
+    language = data.get("language", "en")
+    video_url, err = _generate_kd_video_from_text(text, language)
+    if err:
+        return jsonify({"error": err[0]}), err[1]
+    return jsonify({"video_url": video_url}), 200
+#KD Talker setup (helper already added above)
 # Local development entry point: builds a throwaway Flask app, mounts the
 # blueprint under /rag and serves it on port 7000.
+if __name__ == "__main__":
+    app = Flask(__name__)
     # CORS is limited to the local Angular dev server origins.
     CORS(
         app,
         resources={r"/rag/*": {"origins": ["http://localhost:4200", "http://127.0.0.1:4200"]}},
         allow_headers=["Content-Type", "Authorization", "X-User"],
         methods=["GET", "POST", "OPTIONS"],
     )
     os.makedirs(os.getenv("CHROMA_DIR", "./chroma"), exist_ok=True)
     app.register_blueprint(rag_bp, url_prefix="/rag")
     # NOTE(review): debug=True combined with host 0.0.0.0 exposes Flask's
     # debug mode on every network interface - keep this for local use only.
+    app.run(host="0.0.0.0", port=7000, debug=True)

ragg/ingest_all.py CHANGED Viewed

@@ -18,9 +18,9 @@ IS_HF = bool(os.getenv("HF_HOME") or os.getenv("SPACE_ID"))
 HERE = Path(__file__).resolve().parent
 # PDF root auto-detect
-PDFS_ROOT = (HERE / "pdfs")
 if not PDFS_ROOT.is_dir():
-    PDFS_ROOT = (HERE.parent / "pdfs")  # Works for /app/pdfs/*
 # Chroma root auto-detect
 CHROMA_BASE = Path(os.getenv("CHROMA_ROOT") or ("/data/chroma" if IS_HF else "./chroma"))

 HERE = Path(__file__).resolve().parent
 # PDF root auto-detect
+PDFS_ROOT = (HERE / "assets" / "pdfs")
 if not PDFS_ROOT.is_dir():
+    PDFS_ROOT = (HERE.parent / "assets" / "pdfs")  # Works for /app/assets/pdfs/*
 # Chroma root auto-detect
 CHROMA_BASE = Path(os.getenv("CHROMA_ROOT") or ("/data/chroma" if IS_HF else "./chroma"))

ragg/tts.py CHANGED Viewed

@@ -7,7 +7,7 @@ from TTS.api import TTS
 def xtts_speak_to_file(
     text: str,
     out_file: Optional[Union[str, Path]] = None,
-    reference_dir: Optional[Union[str, Path]] = "trim",
     reference_files: Optional[Sequence[Union[str, Path]]] = None,
     language: str = "en",
     patterns: Iterable[str] = ("*.wav", "*.mp3", "*.flac"),

 def xtts_speak_to_file(
     text: str,
     out_file: Optional[Union[str, Path]] = None,
+    reference_dir: Optional[Union[str, Path]] = "assets",
     reference_files: Optional[Sequence[Union[str, Path]]] = None,
     language: str = "en",
     patterns: Iterable[str] = ("*.wav", "*.mp3", "*.flac"),

reading.py DELETED Viewed

@@ -1,158 +0,0 @@
-from flask import Flask, Blueprint, request, jsonify, current_app
-import openai
-import random
-import os
-from flask_cors import CORS
-# --- Blueprint ---
-reading_bp = Blueprint("reading", __name__)
-# app = Flask(__name__)
-app = Flask(__name__)
-CORS(app)
-_OPENAI_API_KEY_FALLBACK = os.getenv("OPENAI_API_KEY", "")
-# Set up your OpenAI API key (replace this with your own API key)
-# openai.api_key = '<REDACTED>'  # Replace with your actual OpenAI API key (original value redacted: a real-looking secret was committed here and should be revoked)
-def _ensure_openai_key():
-    """Set openai.api_key from app config or env before each API call."""
-    api_key = (current_app.config.get("OPENAI_API_KEY")
-               if current_app else None) or _OPENAI_API_KEY_FALLBACK
-    if api_key:
-        openai.api_key = api_key
-# Function to generate content dynamically based on the topic and difficulty level
-def generate_content(topic, difficulty):
-    _ensure_openai_key()
-    try:
-        # Define instructions based on difficulty level
-        if difficulty == "easy":
-            instruction = f"Write a very simple and basic explanation about {topic} for children aged 6-8. Use very simple words and short sentences."
-        elif difficulty == "medium":
-            instruction = f"Write a detailed and engaging explanation about {topic} for children aged 9-12. Use simple words but include more details."
-        else:  # Hard difficulty
-            instruction = f"Write an in-depth explanation about {topic} for children aged 13-16. Use more complex words and provide deeper insights into the topic."
-        # Call OpenAI API to generate the content
-        response = openai.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "system", "content": "You are a friendly teacher explaining concepts to students."},
-                {"role": "user", "content": instruction}
-            ],
-            max_tokens=700,
-            temperature=0.7
-        )
-        content = response.choices[0].message.content.strip()
-        return content
-    except Exception as e:
-        return f"Error generating content: {str(e)}"
-# Function to generate multiple-choice questions from content based on difficulty level
-def generate_questions(content, difficulty):
-    _ensure_openai_key()
-    try:
-        # Split the content into sentences or key points and shuffle them
-        content_sentences = content.split(".")  # Assuming content is in sentence form. If not, modify accordingly.
-        random.shuffle(content_sentences)
-        # Adjust question complexity based on difficulty
-        if difficulty == "easy":
-            question_instruction = "Generate 3 very simple multiple-choice questions based on the content. The questions should be very easy to understand."
-        elif difficulty == "medium":
-            question_instruction = "Generate 3 multiple-choice questions with moderate difficulty based on the content."
-        else:  # Hard difficulty
-            question_instruction = "Generate 3 challenging multiple-choice questions that require deep understanding of the content."
-        # prompt = f"{question_instruction}\nContent:\n{content}\n\nFormat the output like this:\n\n1. Question: What is XYZ?\nOptions: [Option 1, Option 2, Option 3, Option 4]\nCorrect Answer: Option 1\n\n2. Question: Why does XYZ happen?\nOptions: [Option 1, Option 2, Option 3, Option 4]\nCorrect Answer: Option 2"
-        prompt = f"{question_instruction}\nContent:\n{'. '.join(content_sentences[:3])}\n\nFormat the output like this:\n\n1. Question: What is XYZ?\nOptions: [Option 1, Option 2, Option 3, Option 4]\nCorrect Answer: Option 1\n\n2. Question: Why does XYZ happen?\nOptions: [Option 1, Option 2, Option 3, Option 4]\nCorrect Answer: Option 2"
-        response = openai.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant who generates educational multiple-choice questions."},
-                {"role": "user", "content": prompt}
-            ],
-            max_tokens=700,
-            temperature=0.7
-        )
-        questions = response.choices[0].message.content.strip()
-        return questions
-    except Exception as e:
-        return f"Error generating questions: {str(e)}"
-@reading_bp.route('/generate_content', methods=['POST'])
-# @app.route('/generate_content', methods=['POST'])
-def generate_content_route():
-    data = request.json
-    topic = data.get('topic')
-    difficulty = data.get('difficulty', 'medium')  # Default to medium if not provided
-    if not topic:
-        return jsonify({"error": "Topic is required"}), 400
-    if difficulty not in ["easy", "medium", "hard"]:
-        return jsonify({"error": "Invalid difficulty level. Choose 'easy', 'medium', or 'hard'."}), 400
-    content = generate_content(topic, difficulty)
-    return jsonify({"content": content})
-@reading_bp.route('/generate_questions', methods=['POST'])
-# @app.route('/generate_questions', methods=['POST'])
-def generate_questions_route():
-    data = request.json
-    content = data.get('content')
-    difficulty = data.get('difficulty', 'medium')  # Default to medium if not provided
-    if not content:
-        return jsonify({"error": "Content is required"}), 400
-    if difficulty not in ["easy", "medium", "hard"]:
-        return jsonify({"error": "Invalid difficulty level. Choose 'easy', 'medium', or 'hard'."}), 400
-    questions = generate_questions(content, difficulty)
-    return jsonify({"questions": questions})
-@reading_bp.route('/validate_answer', methods=['POST'])
-# @app.route('/validate_answer', methods=['POST'])
-def validate_answer():
-    question = request.json.get('question')
-    selected_answer = request.json.get('selected_answer')
-    if not question or not selected_answer:
-        return jsonify({"error": "Question and answer are required"}), 400
-    # Ensure both answers are stripped of leading/trailing spaces before comparison
-    correct_answer = question["correct_answer"].strip()
-    selected_answer = selected_answer.strip()
-    # Print the correct answer to the backend console for debugging
-    print(f"Correct Answer: {correct_answer}")
-    is_correct = selected_answer == correct_answer
-    return jsonify({"is_correct": is_correct, "correct_answer": correct_answer})
-# if __name__ == '__main__':
-#     app.run(debug=True)
-# if __name__ == '__main__':
-#     app.run(host='0.0.0.0', port=5001)
-# --- Optional: allow this file to run standalone locally while still using the blueprint ---
-if __name__ == '__main__':
-    app = Flask(__name__)
-    CORS(app)
-    # For local runs, pull key from env; no hard-coding
-    app.config["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "")
-    app.register_blueprint(reading_bp, url_prefix='')
-    app.run(host='0.0.0.0', port=5001, debug=True)

start.sh DELETED Viewed

@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-echo "== Container start =="
-echo "ENV=${ENV:-dev}"
-echo "CHROMA_ROOT=${CHROMA_ROOT:-/data/chroma}"
-# Ensure Chroma root exists
-mkdir -p "${CHROMA_ROOT}"
-# Decide whether ingestion is needed (if any level folder missing or empty)
-_need_ingest=0
-for level in low mid high; do
-  lvl_dir="${CHROMA_ROOT}/${level}"
-  if [ ! -d "$lvl_dir" ] || [ -z "$(ls -A "$lvl_dir" 2>/dev/null || true)" ]; then
-    _need_ingest=1
-  fi
-done
-if [ "${_need_ingest}" -eq 1 ]; then
-  echo "No (or empty) Chroma data found → running ingestion..."
-  # Ingest PDFs from /app/pdfs/{low,mid,high} into ${CHROMA_ROOT}/{low,mid,high}
-  python -m ragg.ingest_all || echo "WARNING: ingestion returned non-zero exit"
-else
-  echo "Chroma already present → skipping ingestion."
-fi
-# Start the API
-exec gunicorn --workers 2 --threads 4 --timeout 120 -b 0.0.0.0:7860 verification:app

trim/voice1.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:09d064bc2bd4880ceb1c6c4a69cb941a1b5e2ea05b151b721aab4cc17c34f56b
-size 5364878

verification.py CHANGED Viewed

@@ -1,530 +1,190 @@
-# --- load .env FIRST ---
 import os
 from dotenv import load_dotenv
-import requests
-from werkzeug.utils import secure_filename
-BASEDIR = os.path.abspath(os.path.dirname(__file__))
-load_dotenv(os.path.join(BASEDIR, ".env"))  # loads DB_USER, DB_PASSWORD, RUN_INIT_DB
-import socket
 import logging
-from threading import Lock
-from functools import wraps
-import datetime
-import bcrypt
-import jwt
-import pyodbc
-from flask import Flask, request, jsonify, make_response, current_app
 from flask_cors import CORS
-# ------------------------------------------------------------------------------
-# App, ENV, CORS
-# ------------------------------------------------------------------------------
-app = Flask(__name__)
-app.config['SECRET_KEY'] = '96c63da06374c1bde332516f3acbd23c84f35f90d8a6321a25d790a0a451af32'
-IS_PROD = os.getenv("ENV", "dev").lower() == "prod"
-_origins = os.getenv("ALLOWED_ORIGINS", "http://localhost:4200")
-ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
-# CORS(app, supports_credentials=True, origins=ALLOWED_ORIGINS)
-# Allow both localhost forms by default if env not set
-_default_origins = "http://localhost:4200,http://127.0.0.1:4200"
-_origins = os.getenv("ALLOWED_ORIGINS", _default_origins)
-ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
-CORS(
-    app,
-    resources={r"/*": {"origins": ALLOWED_ORIGINS}},
-    supports_credentials=True,
-    allow_headers=["Content-Type", "Authorization", "X-Requested-With", "X-User"],
-    expose_headers=["Set-Cookie"],
-    methods=["GET", "POST", "OPTIONS"]
-)
-def extract_username_from_request(req) -> str | None:
-    # 1) Header
-    hdr = req.headers.get("X-User")
-    if hdr:
-        return hdr
-    # 2) Body
-    data = req.get_json(silent=True) or {}
-    if data.get("username"):
-        return data.get("username")
-    # 3) JWT cookie from verification.py
-    token = req.cookies.get("access_token")
-    if token:
-        try:
-            payload = jwt.decode(token, current_app.config["SECRET_KEY"], algorithms=["HS256"])
-            return payload.get("username")
-        except jwt.ExpiredSignatureError:
-            return None
-        except jwt.InvalidTokenError:
-            return None
-    return None
-@app.after_request
-def add_cors_headers(resp):
-    origin = request.headers.get("Origin")
-    if origin and origin in ALLOWED_ORIGINS:
-        # echo the origin, never '*', when using credentials
-        resp.headers["Access-Control-Allow-Origin"] = origin
-        resp.headers["Vary"] = "Origin"
-        resp.headers["Access-Control-Allow-Credentials"] = "true"
-        resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-Requested-With, X-User"
-        resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
-    return resp
-@app.before_request
-def handle_options_early():
-    if request.method == "OPTIONS":
-        resp = app.make_default_options_response()
         origin = request.headers.get("Origin")
         if origin and origin in ALLOWED_ORIGINS:
             resp.headers["Access-Control-Allow-Origin"] = origin
             resp.headers["Access-Control-Allow-Credentials"] = "true"
-        # Mirror browser's requested headers/methods
-        req_headers = request.headers.get("Access-Control-Request-Headers", "Content-Type, Authorization, X-Requested-With, X-User")
-        req_method = request.headers.get("Access-Control-Request-Method", "POST")
-        resp.headers["Access-Control-Allow-Headers"] = req_headers
-        resp.headers["Access-Control-Allow-Methods"] = req_method
         return resp
-logging.basicConfig(level=logging.INFO)
-# NEW: API keys / shared config for blueprints (read from HF Secrets/ENV)
-app.config["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY", "")
-# ------------------------------------------------------------------------------
-# SQL Server configuration
-# ------------------------------------------------------------------------------
-# DB_SERVER = "pykara-sqlserver.cb60o04yk948.ap-south-1.rds.amazonaws.com,1433"
-# DB_DATABASE = "AuthenticationDB1"
-DB_SERVER   = os.getenv("DB_SERVER", r"(localdb)\MSSQLLocalDB")
-DB_DATABASE = os.getenv("DB_DATABASE", "AuthenticationDB1")
-DB_DRIVER   = os.getenv("DB_DRIVER", "ODBC Driver 17 for SQL Server")  # 17 in your image
-# Build connection string (FIXED)
-is_local = (
-    DB_SERVER.lower().startswith("localhost")
-    or DB_SERVER.startswith(".")
-    or DB_SERVER.lower().startswith("(localdb)")
-    or "\\" in DB_SERVER
-)
-if is_local:
-    # Windows local / LocalDB using modern ODBC driver
-    CONN_STR = (
-        f"DRIVER={{{DB_DRIVER}}};"
-        f"SERVER={DB_SERVER};"
-        f"DATABASE={DB_DATABASE};"
-        "Trusted_Connection=yes;"
-        "TrustServerCertificate=yes;"
-    )
-else:
-    # Remote SQL auth
-    CONN_STR = (
-        f"DRIVER={{{DB_DRIVER}}};"
-        f"SERVER={DB_SERVER};DATABASE={DB_DATABASE};"
-        f"UID={os.getenv('DB_USER')};PWD={os.getenv('DB_PASSWORD')};"
-        "Encrypt=yes;TrustServerCertificate=yes;"
-    )
-# def get_db_connection():
-#     """Create a short-timeout connection. Fail clearly if secrets are missing."""
-#     if "Trusted_Connection=yes" not in CONN_STR:
-#         if not os.getenv("DB_USER") or not os.getenv("DB_PASSWORD"):
-#             raise RuntimeError("DB_USER/DB_PASSWORD are not set in the environment.")
-#     return pyodbc.connect(CONN_STR, timeout=5)
-def get_db_connection():
-    """Create a short-timeout connection. Fail clearly if secrets are missing."""
-    if "Trusted_Connection=yes" not in CONN_STR:
-        if not os.getenv("DB_USER") or not os.getenv("DB_PASSWORD"):
-            raise RuntimeError("DB_USER/DB_PASSWORD are not set in the environment.")
-    return pyodbc.connect(CONN_STR, timeout=5)
-@app.get("/db/diag")
-def db_diag():
-    info = {}
-    try:
-        info["drivers_found"] = pyodbc.drivers()
-    except Exception as e:
-        info["drivers_found_error"] = str(e)
-    # Resolve host part (before comma if "host,port")
-    host = DB_SERVER.split(",")[0].strip()
-    info["db_server_env"] = DB_SERVER
-    info["db_database_env"] = DB_DATABASE
-    info["db_driver_env"] = DB_DRIVER
-    try:
-        ip = socket.gethostbyname(host)
-        info["dns_lookup"] = {"host": host, "ip": ip}
-    except Exception as e:
-        info["dns_lookup"] = {"host": host, "error": str(e)}
-    try:
-        conn = get_db_connection()
-        conn.close()
-        info["connect"] = "ok"
-    except Exception as e:
-        info["connect"] = f"error: {e}"
-    return jsonify(info), 200
-def init_db():
-    """Create tables if they do not exist."""
-    conn = get_db_connection()
-    cur = conn.cursor()
-    cur.execute("""
-        IF OBJECT_ID('Users', 'U') IS NULL
-        CREATE TABLE Users (
-            id INT IDENTITY(1,1) PRIMARY KEY,
-            username NVARCHAR(100) UNIQUE NOT NULL,
-            password_hash NVARCHAR(500) NOT NULL,
-            role NVARCHAR(50) DEFAULT 'user'
-        )
-    """)
-    cur.execute("""
-        IF OBJECT_ID('BlacklistedTokens', 'U') IS NULL
-        CREATE TABLE BlacklistedTokens (
-            id INT IDENTITY(1,1) PRIMARY KEY,
-            token NVARCHAR(1000) UNIQUE NOT NULL,
-            created_at DATETIME DEFAULT GETDATE()
-        )
-    """)
-    cur.execute("""
-        IF OBJECT_ID('RefreshTokens', 'U') IS NULL
-        CREATE TABLE RefreshTokens (
-            id INT IDENTITY(1,1) PRIMARY KEY,
-            username NVARCHAR(100) NOT NULL,
-            token NVARCHAR(1000) UNIQUE NOT NULL,
-            created_at DATETIME DEFAULT GETDATE(),
-            FOREIGN KEY (username) REFERENCES Users(username) ON DELETE CASCADE
-        )
-    """)
-    conn.commit()
-    conn.close()
-# ------------------------------------------------------------------------------
-# One-time DB initialisation (Flask 3.x safe)
-# ------------------------------------------------------------------------------
-_db_init_done = False
-_db_init_lock = Lock()
-_should_init = os.getenv("RUN_INIT_DB", "0") == "1"
-@app.before_request
-def maybe_init_db():
-    global _db_init_done
-    if _should_init and not _db_init_done:
-        with _db_init_lock:
-            if not _db_init_done:
-                try:
-                    init_db()
-                    app.logger.info("Database initialised.")
-                except Exception as e:
-                    app.logger.exception("DB init failed: %s", e)
-                finally:
-                    _db_init_done = True
-# ------------------------------------------------------------------------------
-# Cookie helpers
-# ------------------------------------------------------------------------------
-def add_cookie(resp, name: str, value: str, max_age: int):
-    """
-    In prod: Secure + SameSite=None + Partitioned (works with third-party cookie protections).
-    In dev:  SameSite=Lax, not Secure.
-    """
-    if IS_PROD:
-        resp.headers.add(
-            "Set-Cookie",
-            f"{name}={value}; Path=/; Max-Age={max_age}; Secure; HttpOnly; SameSite=None; Partitioned"
-        )
-    else:
-        resp.set_cookie(name, value, httponly=True, secure=False, samesite="Lax", max_age=max_age, path="/")
-# ------------------------------------------------------------------------------
-# Health
-# ------------------------------------------------------------------------------
-@app.get("/")
-def health():
-    return {"status": "ok"}, 200
-# ------------------------------------------------------------------------------
-# Auth utilities
-# ------------------------------------------------------------------------------
-from functools import wraps
-def token_required(f):
-    @wraps(f)
-    def decorated(*args, **kwargs):
-        token = request.cookies.get('access_token')
-        if not token:
-            return jsonify({"message": "Token is missing"}), 401
         try:
-            # Check blacklist
-            conn = get_db_connection()
-            cur = conn.cursor()
-            cur.execute("SELECT token FROM BlacklistedTokens WHERE token = ?", (token,))
-            if cur.fetchone():
-                conn.close()
-                return jsonify({"message": "Token has been revoked. Please log in again."}), 401
-            conn.close()
-            data = jwt.decode(token, app.config['SECRET_KEY'], algorithms=["HS256"])
-            return f(data['username'], *args, **kwargs)
-        except jwt.ExpiredSignatureError:
-            return jsonify({"message": "Token has expired"}), 401
-        except jwt.InvalidTokenError:
-            return jsonify({"message": "Invalid token"}), 401
-        except Exception as e:
-            app.logger.exception("Auth error: %s", e)
-            return jsonify({"message": "Server error"}), 500
-    return decorated
-# ------------------------------------------------------------------------------
-# Routes (verification/auth only)
-# ------------------------------------------------------------------------------
-@app.get("/dashboard")
-@token_required
-def dashboard(username):
-    return jsonify({"message": f"Welcome {username} to your dashboard!"})
-@app.post("/login")
-def login():
-    data = request.json or {}
-    username = data.get('username')
-    password = data.get('password')
-    try:
-        conn = get_db_connection()
-        cur = conn.cursor()
-        cur.execute("SELECT password_hash FROM Users WHERE username = ?", (username,))
-        row = cur.fetchone()
-        conn.close()
-    except Exception as e:
-        app.logger.exception("DB access error on login: %s", e)
-        return jsonify({"message": "Database is unavailable"}), 503
-    if not row:
-        return jsonify({"message": "Invalid credentials"}), 401
-    stored_hash = row[0]
-    if not bcrypt.checkpw(password.encode('utf-8'), stored_hash.encode('utf-8')):
-        return jsonify({"message": "Invalid credentials"}), 401
-    access_token = jwt.encode(
-        {'username': username, 'exp': datetime.datetime.utcnow() + datetime.timedelta(minutes=15)},
-        app.config['SECRET_KEY'],
-        algorithm="HS256"
-    )
-    refresh_token = jwt.encode(
-        {'username': username, 'exp': datetime.datetime.utcnow() + datetime.timedelta(days=7)},
-        app.config['SECRET_KEY'],
-        algorithm="HS256"
-    )
-    try:
-        conn = get_db_connection()
-        cur = conn.cursor()
-        cur.execute("INSERT INTO RefreshTokens (username, token) VALUES (?, ?)", (username, refresh_token))
-        conn.commit()
-        conn.close()
-    except Exception as e:
-        app.logger.exception("DB write error on login: %s", e)
-        return jsonify({"message": "Database is unavailable"}), 503
-    resp = make_response(jsonify({"message": "Login successful"}))
-    add_cookie(resp, 'access_token', access_token, 900)                 # 15 min
-    add_cookie(resp, 'refresh_token', refresh_token, 7*24*60*60)       # 7 days
-    return resp
-@app.post("/refresh")
-def refresh():
-    refresh_token = request.cookies.get("refresh_token")
-    if not refresh_token:
-        return jsonify({'message': 'Refresh token is missing'}), 400
-    try:
-        payload = jwt.decode(refresh_token, app.config['SECRET_KEY'], algorithms=["HS256"])
-    except jwt.ExpiredSignatureError:
-        return jsonify({'message': 'Refresh token has expired'}), 401
-    except jwt.InvalidTokenError:
-        return jsonify({'message': 'Invalid refresh token'}), 401
-    try:
-        conn = get_db_connection()
-        cur = conn.cursor()
-        cur.execute("SELECT username FROM RefreshTokens WHERE token = ?", (refresh_token,))
-        row = cur.fetchone()
-        conn.close()
-    except Exception as e:
-        app.logger.exception("DB access error on refresh: %s", e)
-        return jsonify({"message": "Database is unavailable"}), 503
-    if not row:
-        return jsonify({'message': 'Invalid refresh token'}), 401
-    username = row[0]
-    new_access = jwt.encode(
-        {'username': username, 'exp': datetime.datetime.utcnow() + datetime.timedelta(minutes=15)},
-        app.config['SECRET_KEY'],
-        algorithm="HS256"
-    )
-    resp = make_response(jsonify({'access_token': new_access}))
-    add_cookie(resp, 'access_token', new_access, 900)
-    return resp
-@app.post("/logout")
-@token_required
-def logout(username):
-    token = request.cookies.get('access_token')
-    if not token:
-        return jsonify({"message": "Invalid token format"}), 401
-    try:
-        data = jwt.decode(token, app.config['SECRET_KEY'], algorithms=["HS256"])
-        username = data['username']
-    except jwt.ExpiredSignatureError:
-        return jsonify({"message": "Token has expired"}), 401
-    except jwt.InvalidTokenError:
-        return jsonify({"message": "Invalid token"}), 401
-    try:
-        conn = get_db_connection()
-        cur = conn.cursor()
-        cur.execute("SELECT token FROM BlacklistedTokens WHERE token = ?", (token,))
-        if not cur.fetchone():
-            cur.execute("INSERT INTO BlacklistedTokens (token) VALUES (?)", (token,))
-        cur.execute("DELETE FROM RefreshTokens WHERE username = ?", (username,))
-        conn.commit()
-        conn.close()
-    except Exception as e:
-        app.logger.exception("DB write error on logout: %s", e)
-        return jsonify({"message": "Database is unavailable"}), 503
-    resp = make_response(jsonify({"message": "Logged out successfully!"}))
-    resp.delete_cookie('access_token', path='/')
-    resp.delete_cookie('refresh_token', path='/')
-    return resp
-# @app.post("/upload-pdf")
-# def upload_pdf():
-#     file = request.files.get("pdf")
-#     if not file:
-#         return jsonify({"error": "No file uploaded"}), 400
-#     upload_folder = os.path.join(BASEDIR, "pdfs")
-#     os.makedirs(upload_folder, exist_ok=True)
-#     save_path = os.path.join(upload_folder, file.filename)
-#     file.save(save_path)
-#     # You can optionally trigger RAG indexing here
-#     print(f"✅ PDF saved successfully at: {save_path}")
-#     return jsonify({"message": "PDF uploaded successfully", "path": save_path}), 200
-@app.post("/upload-pdf")
-def upload_pdf():
-    file = request.files.get("pdf")
-    if not file or file.filename.strip() == "":
-        return jsonify({"error": "No file uploaded"}), 400
-    # Save to your backend's pdfs folder (BASEDIR/pdfs)
-    upload_folder = os.path.join(BASEDIR, "pdfs")
-    os.makedirs(upload_folder, exist_ok=True)
-    filename = secure_filename(file.filename)
-    save_path = os.path.join(upload_folder, filename)
-    file.save(save_path)
-    print(f"✅ PDF saved successfully at: {save_path}")
-    # 🔔 Trigger RAG ingestion for THIS file (auto-ingest)
-    RAG_INGEST_URL = os.getenv("RAG_INGEST_URL", "http://localhost:7000/rag/ingest")
-    rag_result = {"status": "skipped"}
-    try:
-        payload = {
-            "paths": [save_path],   # ingest this single PDF
-            # optional tags (use if you plan to filter in RAG later)
-            "subject": "English",
-            "grade": "5"
-        }
-        resp = requests.post(RAG_INGEST_URL, json=payload, timeout=30)
-        resp.raise_for_status()
-        rag_result = resp.json()
-        print("✅ RAG ingest response:", rag_result)
-    except Exception as e:
-        # Do not fail the upload flow if ingest fails — just warn
-        print("⚠️ RAG ingest failed:", e)
-        rag_result = {"status": "warning", "message": f"RAG ingest failed: {str(e)}"}
-    # Frontend already sets localStorage.hasPDF = 'true'; this response is for debugging/visibility
-    return jsonify({
-        "message": "PDF uploaded successfully",
-        "path": save_path,
-        "rag": rag_result
-    }), 200
-@app.get("/check-auth")
-@token_required
-def check_auth(username):
-    return jsonify({"message": "Authenticated", "username": username}), 200
-# ------------------------------------------------------------------------------
-# Register Blueprint: grammar (and later media) lives in testmovie.py
-# ------------------------------------------------------------------------------
-from chat import movie_bp  # ensure testmovie.py defines movie_bp = Blueprint(...)
-from generateQuestion import questions_bp
-from reading import reading_bp
-from writting import writting_bp   # match the exact file name on Linux
-from vocabularyBuilder import vocab_bp
-from findingword import finding_bp
-from listen import listen_bp
-from ragg.app import rag_bp
-from pron import pron_bp
-from pronvideo import pronvideo_bp
-from pronragg import pronragg_bp
-from pronragupgrade import pronragupgrade_bp
-from ragg.ingest_trigger import ingest_trigger_bp
-app.register_blueprint(movie_bp, url_prefix="/media")
-app.register_blueprint(questions_bp, url_prefix="/media")
-app.register_blueprint(reading_bp, url_prefix="/media")
-app.register_blueprint(writting_bp, url_prefix="/media")
-app.register_blueprint(vocab_bp, url_prefix="/media")
-app.register_blueprint(finding_bp, url_prefix="/media")
-app.register_blueprint(listen_bp, url_prefix="/media")
-app.register_blueprint(rag_bp, url_prefix="/rag")
-app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
-app.register_blueprint(pron_bp, url_prefix="/pron")
-app.register_blueprint(pronvideo_bp, url_prefix="/pronvideo")
-app.register_blueprint(pronragg_bp, url_prefix="/pronragg")
-app.register_blueprint(pronragupgrade_bp, url_prefix="/pronragupgrade")
-# app.register_blueprint(questions_bp, url_prefix="/media")  # <-- add this
-# ------------------------------------------------------------------------------
-# Local run (Gunicorn will import `verification:app` on Spaces)
-# ------------------------------------------------------------------------------
 if __name__ == '__main__':
     port = int(os.getenv("PORT", "5000"))
-    app.run(host="0.0.0.0", port=port, debug=True)

+"""
+MJ Learn Backend - Main Flask Application
+A clean, professional Flask application with modular authentication.
+Main Features:
+- JWT-based authentication system
+- Role-based access control (admin/user)
+- Secure token management with blacklisting
+- CORS configuration for cross-origin requests
+- Modular blueprint architecture
+- Environment-based configuration
+"""
 import os
 from dotenv import load_dotenv
 import logging
+from flask import Flask, request
 from flask_cors import CORS
+# Load environment variables first
+BASEDIR = os.path.abspath(os.path.dirname(__file__))
+load_dotenv(os.path.join(BASEDIR, ".env"))
+# --- Build local ChromaDB at startup (expects build_chroma_db.py in same folder) ---
+# NOTE: this block sits at module top level, so it runs every time this module is imported.
+# Imported before the try block so the except handler never depends on an import
+# that lives inside the guarded region.
+import importlib.util
+import traceback
+_CHROMA_SCRIPT_PATH = os.path.join(BASEDIR, "build_chroma_db.py")
+if os.path.exists(_CHROMA_SCRIPT_PATH):
+    try:
+        # Load the script by file path under a private module name
+        spec = importlib.util.spec_from_file_location("build_chroma_db_local", _CHROMA_SCRIPT_PATH)
+        if spec is None or spec.loader is None:
+            # Fail with a readable message instead of an AttributeError on None
+            raise ImportError(f"no import spec could be created for {_CHROMA_SCRIPT_PATH}")
+        build_chroma_mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(build_chroma_mod)
+        if hasattr(build_chroma_mod, "build_chroma"):
+            # Run the builder to create Chroma DB in the local assets folder
+            build_chroma_mod.build_chroma()
+            print("✅ build_chroma_db.build_chroma() executed successfully.")
+        else:
+            print("!! build_chroma_db.py found but no `build_chroma()` function present.")
+    except Exception as e:
+        # Best effort: the failure is reported and module import continues
+        print(f"!! Failed to run build_chroma_db.py: {e}")
+        traceback.print_exc()
+else:
+    print("!! build_chroma_db.py not found in the application folder — skipping Chroma build.")
+# --- End ChromaDB build block ---
+# Import authentication module
+from auth import auth_bp
+from auth.database import ensure_database_initialized
+def create_app():
+    """Create and configure the Flask application (application-factory pattern).
+
+    Reads SECRET_KEY, ALLOWED_ORIGINS and COHERE_API_KEY from the environment,
+    installs CORS handling, a lazy database-initialisation hook and the health
+    route, then registers the auth blueprint under /auth and the feature
+    blueprints via register_feature_blueprints().
+
+    Returns:
+        The configured Flask application.
+
+    Raises:
+        RuntimeError: if SECRET_KEY is missing or empty in the environment.
+    """
+    app = Flask(__name__)
+    # Security configuration: refuse to start without SECRET_KEY
+    app.config['SECRET_KEY'] = os.getenv('SECRET_KEY')
+    if not app.config['SECRET_KEY']:
+        raise RuntimeError("SECRET_KEY must be set in environment variables!")
+    # CORS configuration: comma-separated ALLOWED_ORIGINS, defaulting to both localhost forms
+    _default_origins = "http://localhost:4200,http://127.0.0.1:4200"
+    _origins = os.getenv("ALLOWED_ORIGINS", _default_origins)
+    ALLOWED_ORIGINS = [o.strip() for o in _origins.split(",") if o.strip()]
+    CORS(
+        app,
+        resources={r"/*": {"origins": ALLOWED_ORIGINS}},
+        supports_credentials=True,
+        allow_headers=["Content-Type", "Authorization", "X-Requested-With", "X-User"],
+        expose_headers=["Set-Cookie"],
+        methods=["GET", "POST", "OPTIONS"]
+    )
+    # API configuration for blueprints
+    app.config["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY", "")
+    # CORS handlers
+    @app.after_request
+    def add_cors_headers(resp):
         origin = request.headers.get("Origin")
         if origin and origin in ALLOWED_ORIGINS:
             resp.headers["Access-Control-Allow-Origin"] = origin
+            # Append to Vary rather than overwrite it, so values set elsewhere
+            # (e.g. Accept-Encoding) are preserved
+            resp.vary.add("Origin")
             resp.headers["Access-Control-Allow-Credentials"] = "true"
+            resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization, X-Requested-With, X-User"
+            resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
         return resp
+    @app.before_request
+    def handle_options_early():
+        # Answer CORS preflight requests directly; returning None lets other requests proceed
+        if request.method == "OPTIONS":
+            resp = app.make_default_options_response()
+            origin = request.headers.get("Origin")
+            if origin and origin in ALLOWED_ORIGINS:
+                resp.headers["Access-Control-Allow-Origin"] = origin
+                resp.headers["Access-Control-Allow-Credentials"] = "true"
+            # Mirror browser's requested headers/methods
+            # NOTE(review): add_cors_headers presumably also runs on this response and, for
+            # allowed origins, would replace the two mirrored headers below — confirm intended
+            req_headers = request.headers.get("Access-Control-Request-Headers", "Content-Type, Authorization, X-Requested-With, X-User")
+            req_method = request.headers.get("Access-Control-Request-Method", "POST")
+            resp.headers["Access-Control-Allow-Headers"] = req_headers
+            resp.headers["Access-Control-Allow-Methods"] = req_method
+            return resp
+    # Initialize database before first request (Flask 3.x compatible)
+    @app.before_request
+    def maybe_initialize_database():
+        # The flag is only set after a successful initialisation, so a failed
+        # attempt is logged and retried on the next request
+        if not hasattr(app, '_db_initialized'):
+            try:
+                ensure_database_initialized()
+                app._db_initialized = True
+            except Exception as e:
+                app.logger.exception("Database initialization failed: %s", e)
+    # Health check endpoint
+    @app.route("/")
+    def health():
+        return {"status": "ok", "service": "MJ Learn Backend"}, 200
+    # Register authentication blueprint
+    app.register_blueprint(auth_bp, url_prefix="/auth")
+    # Register other feature blueprints
+    register_feature_blueprints(app)
+    return app
+def register_feature_blueprints(app):
+    """Import and register the optional feature blueprints on *app*.
+
+    Each entry is (module path, blueprint attribute name, URL prefix). A module
+    that cannot be imported, or that lacks the named attribute, is reported on
+    stdout and skipped so the remaining blueprints are still registered.
+
+    Args:
+        app: object exposing ``register_blueprint(blueprint, url_prefix=...)``.
+    """
+    import importlib
+    blueprints = [
+        ("ragg.app", "rag_bp", "/rag"),
+        ("pronunciation", "pronunciation_bp", "/pronunciation"),
+        ("ragg.ingest_trigger", "ingest_trigger_bp", "/rag"),
+    ]
+    for module_name, blueprint_name, url_prefix in blueprints:
         try:
+            # import_module returns the leaf module for dotted paths such as "ragg.app"
+            module = importlib.import_module(module_name)
+            blueprint = getattr(module, blueprint_name)
+            app.register_blueprint(blueprint, url_prefix=url_prefix)
+            print(f"✅ Registered {blueprint_name}")
+        except ImportError as e:
+            print(f"!! Could not import {blueprint_name}: {e}")
+        except AttributeError as e:
+            print(f"!! Blueprint {blueprint_name} not found in {module_name}: {e}")
+# Create Flask app instance (module-level so a WSGI server can import `app`)
+app = create_app()
+# Configure logging
+logging.basicConfig(level=logging.INFO)
 if __name__ == '__main__':
+    # Local development entry point: print a short banner, then start the built-in server
+    env_name = os.getenv("ENV", "dev")
+    print("Starting MJ Learn Backend...")
+    print(f"✅ SECRET_KEY loaded: {bool(app.config.get('SECRET_KEY'))}")
+    print(f"✅ Environment: {env_name}")
+    print("=" * 60)
     port = int(os.getenv("PORT", "5000"))
+    print(f"Server starting on http://localhost:{port}")
+    print("Available endpoints:")
+    print("   GET  /                    - Health check")
+    print("   Authentication:")
+    print("   POST /auth/signup         - User registration")
+    print("   POST /auth/login          - User login")
+    print("   POST /auth/refresh        - Token refresh")
+    print("   POST /auth/logout         - User logout")
+    print("   GET  /auth/dashboard      - Protected endpoint")
+    print("   GET  /auth/check-auth     - Auth status check")
+    print("   GET  /auth/db/diag        - Database diagnostics (ADMIN)")
+    print("   Admin Management:")
+    print("   GET  /auth/admin/users          - List all users (ADMIN)")
+    print("   POST /auth/admin/promote-user   - Promote user to admin (ADMIN)")
+    print("   POST /auth/admin/create-first-admin - Create first admin")
+    print("=" * 60)
+    # The server binds to all interfaces, so never enable the interactive
+    # debugger/reloader when ENV is "prod"
+    debug = env_name.lower() != "prod"
+    try:
+        app.run(host="0.0.0.0", port=port, debug=debug)
+    except Exception as e:
+        print(f"!! Failed to start server: {e}")
+        raise