Spaces:
Sleeping
Sleeping
| """ | |
| VidyaBot Database Schema & Initialization | |
| Handles SQLite schema creation, connection pooling, and lifecycle management. | |
| All data is stored in a single .db file for portability. | |
| """ | |
| import sqlite3 | |
| import os | |
| import numpy as np | |
| from pathlib import Path | |
| from typing import Optional | |
| from datetime import datetime | |
| from backend.config import settings | |
| # Global connection pool (simplified - production would use proper pooling) | |
| _db_connection: Optional[sqlite3.Connection] = None | |
| def get_db_connection() -> sqlite3.Connection: | |
| """Get or create database connection.""" | |
| global _db_connection | |
| if _db_connection is None: | |
| db_path = settings.DB_PATH | |
| os.makedirs(os.path.dirname(db_path), exist_ok=True) | |
| _db_connection = sqlite3.connect(db_path, check_same_thread=False) | |
| _db_connection.row_factory = sqlite3.Row # Allow dict-like access | |
| return _db_connection | |
| def init_db() -> None: | |
| """Initialize database schema. Idempotent — safe to call multiple times.""" | |
| conn = get_db_connection() | |
| cursor = conn.cursor() | |
| # Create textbooks table | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS textbooks ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| filename TEXT UNIQUE NOT NULL, | |
| title TEXT, | |
| board TEXT, | |
| subject TEXT, | |
| grade TEXT, | |
| total_pages INTEGER, | |
| total_chunks INTEGER, | |
| ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| """) | |
| # Create chunks table | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS chunks ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| textbook_id INTEGER REFERENCES textbooks(id) ON DELETE CASCADE, | |
| chapter_number INTEGER, | |
| chapter_title TEXT, | |
| section_title TEXT, | |
| page_number INTEGER, | |
| chunk_index INTEGER, | |
| content TEXT NOT NULL, | |
| token_count INTEGER, | |
| embedding BLOB, | |
| UNIQUE(textbook_id, chapter_number, chunk_index) | |
| ) | |
| """) | |
| # Create BM25 index table | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS bm25_index ( | |
| chunk_id INTEGER REFERENCES chunks(id) ON DELETE CASCADE, | |
| term TEXT, | |
| tf_idf REAL, | |
| PRIMARY KEY (chunk_id, term) | |
| ) | |
| """) | |
| # Create query cache table | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS query_cache ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| query_hash TEXT UNIQUE NOT NULL, | |
| query_text TEXT NOT NULL, | |
| textbook_id INTEGER, | |
| language TEXT DEFAULT 'english', | |
| answer TEXT NOT NULL, | |
| context_tokens_used INTEGER, | |
| model_used TEXT, | |
| pruning_ratio REAL, | |
| source_pages TEXT, | |
| created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, | |
| accessed_count INTEGER DEFAULT 0, | |
| last_accessed TIMESTAMP | |
| ) | |
| """) | |
| # Create cost tracking table | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS cost_log ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| query_id INTEGER, | |
| textbook_id INTEGER, | |
| baseline_tokens INTEGER DEFAULT 2000, | |
| actual_tokens_used INTEGER, | |
| tokens_saved INTEGER, | |
| cost_usd REAL, | |
| cost_saved_usd REAL, | |
| cache_hit BOOLEAN DEFAULT 0, | |
| model_used TEXT, | |
| timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| """) | |
| # Create FAISS index metadata table (for tracking indexed textbooks) | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS faiss_metadata ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| textbook_id INTEGER UNIQUE REFERENCES textbooks(id) ON DELETE CASCADE, | |
| faiss_index_size INTEGER, | |
| indexed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| """) | |
| # Create BM25 metadata table (for tracking indexed textbooks) | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS bm25_metadata ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| textbook_id INTEGER UNIQUE REFERENCES textbooks(id) ON DELETE CASCADE, | |
| indexed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| """) | |
| # V2 UPGRADE TABLES — Curriculum routing + Sentence pruning + Teacher analytics | |
| # Chapter-level curriculum tags (added during ingestion) | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS chapter_tags ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| textbook_id INTEGER REFERENCES textbooks(id) ON DELETE CASCADE, | |
| chapter_number INTEGER, | |
| subject_domain TEXT, | |
| bloom_levels TEXT, | |
| keywords TEXT, | |
| UNIQUE(textbook_id, chapter_number) | |
| ) | |
| """) | |
| # Sentence-level pruning log (for analytics) | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS pruning_log ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| query_id INTEGER, | |
| stage TEXT, | |
| chunks_in INTEGER, | |
| chunks_out INTEGER, | |
| tokens_in INTEGER, | |
| tokens_out INTEGER, | |
| latency_ms INTEGER, | |
| timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| """) | |
| # Teacher analytics cache (pre-computed daily) | |
| cursor.execute(""" | |
| CREATE TABLE IF NOT EXISTS teacher_analytics ( | |
| id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| textbook_id INTEGER REFERENCES textbooks(id) ON DELETE CASCADE, | |
| date TEXT, | |
| top_questions TEXT, | |
| weak_chapters TEXT, | |
| hourly_usage TEXT, | |
| computed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP | |
| ) | |
| """) | |
| # Add new columns to existing tables (if they don't exist) | |
| try: | |
| cursor.execute("ALTER TABLE chunks ADD COLUMN subject_domain TEXT") | |
| except sqlite3.OperationalError: | |
| pass # Column already exists | |
| try: | |
| cursor.execute("ALTER TABLE chunks ADD COLUMN bloom_level TEXT DEFAULT 'recall'") | |
| except sqlite3.OperationalError: | |
| pass | |
| try: | |
| cursor.execute("ALTER TABLE cost_log ADD COLUMN interface TEXT DEFAULT 'web'") | |
| except sqlite3.OperationalError: | |
| pass | |
| conn.commit() | |
| print("[DB] Database schema initialized successfully") | |
| def get_db() -> sqlite3.Connection: | |
| """Get database connection for use in routes.""" | |
| return get_db_connection() | |
| def close_db() -> None: | |
| """Close database connection.""" | |
| global _db_connection | |
| if _db_connection: | |
| _db_connection.close() | |
| _db_connection = None | |
| class Textbook: | |
| """DTO for textbook metadata.""" | |
| def __init__(self, id: int, filename: str, title: str, board: str, | |
| subject: str, grade: str, total_pages: int, total_chunks: int, | |
| ingested_at: str): | |
| self.id = id | |
| self.filename = filename | |
| self.title = title | |
| self.board = board | |
| self.subject = subject | |
| self.grade = grade | |
| self.total_pages = total_pages | |
| self.total_chunks = total_chunks | |
| self.ingested_at = ingested_at | |
| def to_dict(self): | |
| return { | |
| "id": self.id, | |
| "filename": self.filename, | |
| "title": self.title, | |
| "board": self.board, | |
| "subject": self.subject, | |
| "grade": self.grade, | |
| "total_pages": self.total_pages, | |
| "total_chunks": self.total_chunks, | |
| "ingested_at": self.ingested_at | |
| } | |
| class Chunk: | |
| """DTO for text chunks.""" | |
| def __init__(self, id: int, textbook_id: int, chapter_number: int, | |
| chapter_title: str, section_title: str, page_number: int, | |
| chunk_index: int, content: str, token_count: int, | |
| embedding_bytes: Optional[bytes] = None): | |
| self.id = id | |
| self.textbook_id = textbook_id | |
| self.chapter_number = chapter_number | |
| self.chapter_title = chapter_title | |
| self.section_title = section_title | |
| self.page_number = page_number | |
| self.chunk_index = chunk_index | |
| self.content = content | |
| self.token_count = token_count | |
| self._embedding_bytes = embedding_bytes | |
| def embedding(self) -> Optional[np.ndarray]: | |
| """Deserialize embedding from bytes.""" | |
| if self._embedding_bytes is None: | |
| return None | |
| return np.frombuffer(self._embedding_bytes, dtype=np.float32) | |
| def embedding(self, value: np.ndarray): | |
| """Serialize embedding to bytes.""" | |
| if value is not None: | |
| self._embedding_bytes = value.astype(np.float32).tobytes() | |
| else: | |
| self._embedding_bytes = None | |
| def to_dict(self): | |
| return { | |
| "id": self.id, | |
| "textbook_id": self.textbook_id, | |
| "chapter_number": self.chapter_number, | |
| "chapter_title": self.chapter_title, | |
| "section_title": self.section_title, | |
| "page_number": self.page_number, | |
| "chunk_index": self.chunk_index, | |
| "content": self.content, | |
| "token_count": self.token_count | |
| } | |
| class PruningResult: | |
| """DTO for context pruning output.""" | |
| def __init__(self, chunks: list, total_tokens: int, | |
| baseline_tokens: int = 2000, pruning_ratio: float = 0.0, | |
| stage_timings: dict = None): | |
| self.chunks = chunks | |
| self.total_tokens = total_tokens | |
| self.baseline_tokens = baseline_tokens | |
| self.pruning_ratio = pruning_ratio # e.g., 0.80 = 80% reduction | |
| self.stage_timings = stage_timings or {} | |
| self.tokens_saved = baseline_tokens - total_tokens | |
| def to_dict(self): | |
| return { | |
| "chunks": [c.to_dict() for c in self.chunks], | |
| "total_tokens": self.total_tokens, | |
| "baseline_tokens": self.baseline_tokens, | |
| "tokens_saved": self.tokens_saved, | |
| "pruning_ratio": self.pruning_ratio, | |
| "stage_timings": self.stage_timings | |
| } | |
| class CostLog: | |
| """DTO for cost tracking.""" | |
| def __init__(self, query_id: int, baseline_tokens: int, actual_tokens_used: int, | |
| cache_hit: bool = False, model_used: str = "claude-haiku-4-5-20251001"): | |
| self.query_id = query_id | |
| self.baseline_tokens = baseline_tokens | |
| self.actual_tokens_used = actual_tokens_used | |
| self.tokens_saved = baseline_tokens - actual_tokens_used | |
| self.cache_hit = cache_hit | |
| self.model_used = model_used | |
| # Calculate costs (Haiku pricing for Claude, 0 for local Ollama) | |
| is_ollama = (settings.LLM_BACKEND == "ollama") or ("claude" not in model_used.lower()) | |
| if is_ollama: | |
| self.cost_usd = 0.0 | |
| else: | |
| self.cost_usd = (actual_tokens_used / 1_000_000) * settings.HAIKU_INPUT_COST_PER_1M | |
| baseline_cost = (baseline_tokens / 1_000_000) * settings.HAIKU_INPUT_COST_PER_1M | |
| self.cost_saved_usd = baseline_cost - self.cost_usd | |
| def to_dict(self): | |
| return { | |
| "query_id": self.query_id, | |
| "baseline_tokens": self.baseline_tokens, | |
| "actual_tokens_used": self.actual_tokens_used, | |
| "tokens_saved": self.tokens_saved, | |
| "cost_usd": self.cost_usd, | |
| "cost_saved_usd": self.cost_saved_usd, | |
| "cache_hit": self.cache_hit, | |
| "model_used": self.model_used | |
| } | |
| class LLMResponse: | |
| """DTO for LLM API responses.""" | |
| def __init__(self, answer: str, input_tokens: int, output_tokens: int, | |
| model: str = "claude-haiku-4-5-20251001"): | |
| self.answer = answer | |
| self.input_tokens = input_tokens | |
| self.output_tokens = output_tokens | |
| self.total_tokens = input_tokens + output_tokens | |
| self.model = model | |
| # Calculate cost | |
| is_ollama = (settings.LLM_BACKEND == "ollama") or ("claude" not in model.lower()) | |
| if is_ollama: | |
| self.cost_usd = 0.0 | |
| else: | |
| self.cost_usd = ( | |
| (input_tokens / 1_000_000) * settings.HAIKU_INPUT_COST_PER_1M + | |
| (output_tokens / 1_000_000) * settings.HAIKU_OUTPUT_COST_PER_1M | |
| ) | |
| def to_dict(self): | |
| return { | |
| "answer": self.answer, | |
| "input_tokens": self.input_tokens, | |
| "output_tokens": self.output_tokens, | |
| "total_tokens": self.total_tokens, | |
| "cost_usd": self.cost_usd, | |
| "model": self.model | |
| } | |