import json import logging import sqlite3 from datetime import datetime from typing import Optional import config logger = logging.getLogger(__name__) DB_PATH = config.SQLITE_DB_PATH def init_db(): """Initialize SQLite database.""" conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() cursor.execute(""" CREATE TABLE IF NOT EXISTS documents ( id INTEGER PRIMARY KEY AUTOINCREMENT, document_id TEXT UNIQUE NOT NULL, filename TEXT, file_extension TEXT, document_type TEXT, language TEXT, is_arabic INTEGER DEFAULT 0, total_pages INTEGER DEFAULT 0, total_words INTEGER DEFAULT 0, extraction_source TEXT, confidence REAL DEFAULT 0.0, full_text TEXT, structured_json TEXT, processed_at TEXT, created_at TEXT DEFAULT CURRENT_TIMESTAMP ) """) cursor.execute(""" CREATE TABLE IF NOT EXISTS entities ( id INTEGER PRIMARY KEY AUTOINCREMENT, document_id TEXT, entity_type TEXT, entity_value TEXT, created_at TEXT DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (document_id) REFERENCES documents(document_id) ) """) cursor.execute(""" CREATE TABLE IF NOT EXISTS benchmark_results ( id INTEGER PRIMARY KEY AUTOINCREMENT, test_name TEXT, language TEXT, extraction_source TEXT, word_count INTEGER, char_count INTEGER, confidence REAL, processing_time_ms REAL, arabic_char_ratio REAL, run_at TEXT DEFAULT CURRENT_TIMESTAMP ) """) cursor.execute( "CREATE INDEX IF NOT EXISTS idx_doc_id ON documents(document_id)" ) cursor.execute( "CREATE INDEX IF NOT EXISTS idx_language ON documents(language)" ) cursor.execute( "CREATE INDEX IF NOT EXISTS idx_doc_type ON documents(document_type)" ) conn.commit() conn.close() logger.info(f"Database initialized at {DB_PATH}") def save_document(structured_doc: dict) -> bool: """Save a structured document to the database.""" try: init_db() conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() meta = structured_doc.get("metadata", {}) analysis = structured_doc.get("document_analysis", {}) content = structured_doc.get("content", {}) cursor.execute( """ INSERT OR REPLACE INTO documents ( document_id, filename, file_extension, document_type, language, is_arabic, total_pages, total_words, extraction_source, confidence, full_text, structured_json, processed_at ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( structured_doc.get("document_id"), meta.get("filename"), meta.get("file_extension"), analysis.get("document_type"), analysis.get("language"), 1 if analysis.get("is_arabic") else 0, analysis.get("total_pages", 0), analysis.get("total_words", 0), meta.get("extraction_source"), meta.get("extraction_confidence", 0.0), content.get("full_text", ""), json.dumps(structured_doc, ensure_ascii=False), meta.get("processed_at"), ), ) doc_id = structured_doc.get("document_id") entities = structured_doc.get("extracted_entities", {}) for entity_type, values in entities.items(): for value in values if isinstance(values, list) else [values]: if value: cursor.execute( """ INSERT INTO entities (document_id, entity_type, entity_value) VALUES (?, ?, ?) """, (doc_id, entity_type, str(value)), ) conn.commit() conn.close() logger.info(f"Saved document: {doc_id}") return True except Exception as e: logger.error(f"Failed to save document: {e}") return False def get_all_documents(limit: int = 50, offset: int = 0) -> list: """Retrieve all documents.""" try: init_db() conn = sqlite3.connect(DB_PATH) conn.row_factory = sqlite3.Row cursor = conn.cursor() cursor.execute( """ SELECT document_id, filename, document_type, language, is_arabic, total_pages, total_words, extraction_source, confidence, processed_at, created_at FROM documents ORDER BY created_at DESC LIMIT ? OFFSET ? """, (limit, offset), ) rows = [dict(row) for row in cursor.fetchall()] conn.close() return rows except Exception as e: logger.error(f"Failed to retrieve documents: {e}") return [] def get_document_by_id(document_id: str) -> Optional[dict]: """Retrieve a specific document by ID.""" try: init_db() conn = sqlite3.connect(DB_PATH) conn.row_factory = sqlite3.Row cursor = conn.cursor() cursor.execute( "SELECT * FROM documents WHERE document_id = ?", (document_id,), ) row = cursor.fetchone() conn.close() if row: doc = dict(row) if doc.get("structured_json"): doc["structured_data"] = json.loads(doc["structured_json"]) return doc return None except Exception as e: logger.error(f"Failed to get document {document_id}: {e}") return None def search_documents(query: str, language: str = None) -> list: """Search documents by text content.""" try: init_db() conn = sqlite3.connect(DB_PATH) conn.row_factory = sqlite3.Row cursor = conn.cursor() if language: cursor.execute( """ SELECT document_id, filename, document_type, language, total_words, confidence, processed_at FROM documents WHERE full_text LIKE ? AND language = ? ORDER BY created_at DESC LIMIT 20 """, (f"%{query}%", language), ) else: cursor.execute( """ SELECT document_id, filename, document_type, language, total_words, confidence, processed_at FROM documents WHERE full_text LIKE ? ORDER BY created_at DESC LIMIT 20 """, (f"%{query}%",), ) rows = [dict(row) for row in cursor.fetchall()] conn.close() return rows except Exception as e: logger.error(f"Search failed: {e}") return [] def get_stats() -> dict: """Get overall database statistics.""" try: init_db() conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() cursor.execute("SELECT COUNT(*) FROM documents") total = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM documents WHERE is_arabic = 1") arabic_count = cursor.fetchone()[0] cursor.execute( "SELECT document_type, COUNT(*) as cnt FROM documents GROUP BY document_type" ) by_type = dict(cursor.fetchall()) cursor.execute("SELECT AVG(confidence) FROM documents WHERE confidence > 0") avg_conf = cursor.fetchone()[0] or 0.0 cursor.execute("SELECT SUM(total_words) FROM documents") total_words = cursor.fetchone()[0] or 0 conn.close() return { "total_documents": total, "arabic_documents": arabic_count, "english_documents": total - arabic_count, "by_document_type": by_type, "average_confidence": round(avg_conf, 4), "total_words_processed": total_words, } except Exception as e: logger.error(f"Stats failed: {e}") return {}