""" Database manager for storing translations and corrections Uses SQLite for simplicity """ import sqlite3 import logging from datetime import datetime from typing import List, Dict, Optional, Any import os logger = logging.getLogger(__name__) class DatabaseManager: """Manages SQLite database for translation storage""" def __init__(self, db_path: str = "../data/translations.db"): self.db_path = db_path self.ensure_db_directory() def ensure_db_directory(self): """Ensure the database directory exists""" os.makedirs(os.path.dirname(os.path.abspath(self.db_path)), exist_ok=True) def get_connection(self) -> sqlite3.Connection: """Get database connection""" conn = sqlite3.connect(self.db_path) conn.row_factory = sqlite3.Row # Enable column access by name return conn def initialize_database(self): """Initialize database tables""" try: with self.get_connection() as conn: # Create translations table conn.execute(""" CREATE TABLE IF NOT EXISTS translations ( id INTEGER PRIMARY KEY AUTOINCREMENT, original_text TEXT NOT NULL, translated_text TEXT NOT NULL, source_language TEXT NOT NULL, target_language TEXT NOT NULL, model_confidence REAL DEFAULT 0.0, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """) # Create corrections table conn.execute(""" CREATE TABLE IF NOT EXISTS corrections ( id INTEGER PRIMARY KEY AUTOINCREMENT, translation_id INTEGER NOT NULL, corrected_text TEXT NOT NULL, feedback TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (translation_id) REFERENCES translations (id) ) """) # Create indexes for better performance conn.execute(""" CREATE INDEX IF NOT EXISTS idx_translations_languages ON translations (source_language, target_language) """) conn.execute(""" CREATE INDEX IF NOT EXISTS idx_translations_created ON translations (created_at) """) conn.execute(""" CREATE INDEX IF NOT EXISTS idx_corrections_translation ON corrections (translation_id) """) conn.commit() logger.info("Database initialized successfully") except Exception as e: logger.error(f"Database initialization error: {str(e)}") raise def store_translation( self, original_text: str, translated_text: str, source_language: str, target_language: str, model_confidence: float = 0.0 ) -> int: """ Store a translation in the database Args: original_text: Original text translated_text: Translated text source_language: Source language code target_language: Target language code model_confidence: Model confidence score Returns: Translation ID """ try: with self.get_connection() as conn: cursor = conn.execute(""" INSERT INTO translations (original_text, translated_text, source_language, target_language, model_confidence) VALUES (?, ?, ?, ?, ?) """, (original_text, translated_text, source_language, target_language, model_confidence)) translation_id = cursor.lastrowid conn.commit() logger.info(f"Translation stored with ID: {translation_id}") return translation_id except Exception as e: logger.error(f"Error storing translation: {str(e)}") raise def store_correction( self, translation_id: int, corrected_text: str, feedback: Optional[str] = None ) -> int: """ Store a correction for a translation Args: translation_id: ID of the original translation corrected_text: Corrected text feedback: Optional feedback about the correction Returns: Correction ID """ try: with self.get_connection() as conn: cursor = conn.execute(""" INSERT INTO corrections (translation_id, corrected_text, feedback) VALUES (?, ?, ?) """, (translation_id, corrected_text, feedback)) correction_id = cursor.lastrowid conn.commit() logger.info(f"Correction stored with ID: {correction_id}") return correction_id except Exception as e: logger.error(f"Error storing correction: {str(e)}") raise def get_translation_history( self, limit: int = 50, offset: int = 0, source_language: Optional[str] = None, target_language: Optional[str] = None ) -> List[Dict[str, Any]]: """ Get translation history Args: limit: Maximum number of records to return offset: Number of records to skip source_language: Filter by source language target_language: Filter by target language Returns: List of translation history records """ try: with self.get_connection() as conn: # Build query with optional filters where_conditions = [] params = [] if source_language: where_conditions.append("t.source_language = ?") params.append(source_language) if target_language: where_conditions.append("t.target_language = ?") params.append(target_language) where_clause = "" if where_conditions: where_clause = "WHERE " + " AND ".join(where_conditions) query = f""" SELECT t.id, t.original_text, t.translated_text, t.source_language, t.target_language, t.model_confidence, t.created_at, c.corrected_text, c.feedback as correction_feedback FROM translations t LEFT JOIN corrections c ON t.id = c.translation_id {where_clause} ORDER BY t.created_at DESC LIMIT ? OFFSET ? """ params.extend([limit, offset]) cursor = conn.execute(query, params) rows = cursor.fetchall() # Convert to dictionaries results = [] for row in rows: results.append({ "id": row["id"], "original_text": row["original_text"], "translated_text": row["translated_text"], "source_language": row["source_language"], "target_language": row["target_language"], "model_confidence": row["model_confidence"], "created_at": row["created_at"], "corrected_text": row["corrected_text"], "correction_feedback": row["correction_feedback"] }) return results except Exception as e: logger.error(f"Error retrieving translation history: {str(e)}") raise def get_translation_by_id(self, translation_id: int) -> Optional[Dict[str, Any]]: """ Get a specific translation by ID Args: translation_id: Translation ID Returns: Translation record or None if not found """ try: with self.get_connection() as conn: cursor = conn.execute(""" SELECT t.id, t.original_text, t.translated_text, t.source_language, t.target_language, t.model_confidence, t.created_at, c.corrected_text, c.feedback as correction_feedback FROM translations t LEFT JOIN corrections c ON t.id = c.translation_id WHERE t.id = ? """, (translation_id,)) row = cursor.fetchone() if row: return { "id": row["id"], "original_text": row["original_text"], "translated_text": row["translated_text"], "source_language": row["source_language"], "target_language": row["target_language"], "model_confidence": row["model_confidence"], "created_at": row["created_at"], "corrected_text": row["corrected_text"], "correction_feedback": row["correction_feedback"] } return None except Exception as e: logger.error(f"Error retrieving translation {translation_id}: {str(e)}") raise def get_corrections_for_training(self, limit: int = 1000) -> List[Dict[str, Any]]: """ Get corrections that can be used for model fine-tuning Args: limit: Maximum number of corrections to return Returns: List of correction records suitable for training """ try: with self.get_connection() as conn: cursor = conn.execute(""" SELECT t.original_text, t.source_language, t.target_language, c.corrected_text, c.feedback, c.created_at FROM corrections c JOIN translations t ON c.translation_id = t.id ORDER BY c.created_at DESC LIMIT ? """, (limit,)) rows = cursor.fetchall() results = [] for row in rows: results.append({ "original_text": row["original_text"], "source_language": row["source_language"], "target_language": row["target_language"], "corrected_text": row["corrected_text"], "feedback": row["feedback"], "created_at": row["created_at"] }) return results except Exception as e: logger.error(f"Error retrieving corrections for training: {str(e)}") raise def get_statistics(self) -> Dict[str, Any]: """ Get database statistics Returns: Dictionary with various statistics """ try: with self.get_connection() as conn: # Total translations cursor = conn.execute("SELECT COUNT(*) FROM translations") total_translations = cursor.fetchone()[0] # Total corrections cursor = conn.execute("SELECT COUNT(*) FROM corrections") total_corrections = cursor.fetchone()[0] # Translations by language pair cursor = conn.execute(""" SELECT source_language, target_language, COUNT(*) as count FROM translations GROUP BY source_language, target_language ORDER BY count DESC """) language_pairs = cursor.fetchall() # Recent activity (last 7 days) cursor = conn.execute(""" SELECT COUNT(*) FROM translations WHERE created_at >= datetime('now', '-7 days') """) recent_translations = cursor.fetchone()[0] return { "total_translations": total_translations, "total_corrections": total_corrections, "recent_translations": recent_translations, "language_pairs": [ { "source": row["source_language"], "target": row["target_language"], "count": row["count"] } for row in language_pairs ] } except Exception as e: logger.error(f"Error retrieving statistics: {str(e)}") raise def cleanup_old_records(self, days: int = 30): """ Clean up old translation records Args: days: Number of days to keep records """ try: with self.get_connection() as conn: # Delete old corrections first (due to foreign key constraint) cursor = conn.execute(""" DELETE FROM corrections WHERE translation_id IN ( SELECT id FROM translations WHERE created_at < datetime('now', '-' || ? || ' days') ) """, (days,)) deleted_corrections = cursor.rowcount # Delete old translations cursor = conn.execute(""" DELETE FROM translations WHERE created_at < datetime('now', '-' || ? || ' days') """, (days,)) deleted_translations = cursor.rowcount conn.commit() logger.info(f"Cleaned up {deleted_translations} translations and {deleted_corrections} corrections older than {days} days") except Exception as e: logger.error(f"Error during cleanup: {str(e)}") raise