Really-amin committed on
Commit
f022ccd
·
verified ·
1 Parent(s): ccbebf0

Upload 2 files

Browse files
Files changed (2) hide show
  1. data/iran_legal.db +0 -0
  2. src/streamlit_app.py +1039 -0
data/iran_legal.db ADDED
File without changes
src/streamlit_app.py ADDED
@@ -0,0 +1,1039 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Iran Legal Information Dashboard - Enhanced Version
4
+ ==================================================
5
+ Complete Working System with Robust Database Management, OCR, AI Analysis, and Web Scraping
6
+ Designed for Hugging Face Spaces deployment with enhanced error handling
7
+ """
8
+
9
# Standard library
import asyncio
import base64
import hashlib
import io
import json
import logging
import os
import re
import sqlite3
import sys
import tempfile
import time
from collections import Counter
from contextlib import contextmanager
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
from urllib.parse import urlparse, urljoin

# Third-party
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import streamlit as st
from bs4 import BeautifulSoup
32
+
33
+ # Configure logging
34
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # Page configuration
38
+ st.set_page_config(
39
+ page_title="داشبورد اطلاعات حقوقی ایران",
40
+ page_icon="⚖️",
41
+ layout="wide",
42
+ initial_sidebar_state="expanded"
43
+ )
44
+
45
+ # Advanced CSS for beautiful UI (same as before)
46
def load_css():
    """Inject the dashboard's global CSS via a raw <style> block.

    Sets up RTL layout with the Vazir font, gradient metric/feature cards,
    colored status pills, hides Streamlit's default chrome (menu, footer,
    header, deploy button) and adds responsive tweaks for small screens.
    Must be called once near the top of the page render.
    """
    st.markdown("""
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Vazir:wght@300;400;500;600;700&display=swap');

        /* Global Styles */
        .main {
            font-family: 'Vazir', 'Segoe UI', sans-serif;
            direction: rtl;
            text-align: right;
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            min-height: 100vh;
        }

        /* Header Styles */
        .main-header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 2rem;
            border-radius: 20px;
            color: white;
            margin-bottom: 2rem;
            text-align: center;
            box-shadow: 0 20px 40px rgba(102, 126, 234, 0.3);
            position: relative;
            overflow: hidden;
        }

        .main-header::before {
            content: '';
            position: absolute;
            top: -50%;
            left: -50%;
            width: 200%;
            height: 200%;
            background: linear-gradient(45deg, transparent, rgba(255,255,255,0.1), transparent);
            transform: rotate(45deg);
            animation: shine 3s infinite;
        }

        @keyframes shine {
            0% { transform: translateX(-100%) translateY(-100%) rotate(45deg); }
            100% { transform: translateX(100%) translateY(100%) rotate(45deg); }
        }

        .main-header h1 {
            font-size: 2.5rem;
            margin-bottom: 0.5rem;
            font-weight: 700;
            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        }

        .main-header p {
            font-size: 1.2rem;
            opacity: 0.9;
            margin: 0;
        }

        /* Status Indicators */
        .status-indicator {
            display: inline-flex;
            align-items: center;
            padding: 0.25rem 0.75rem;
            border-radius: 20px;
            font-size: 0.85rem;
            font-weight: 500;
            margin: 0.25rem;
        }

        .status-success {
            background: linear-gradient(135deg, #11998e, #38ef7d);
            color: white;
        }

        .status-warning {
            background: linear-gradient(135deg, #f093fb, #f5576c);
            color: white;
        }

        .status-info {
            background: linear-gradient(135deg, #4facfe, #00f2fe);
            color: white;
        }

        .status-error {
            background: linear-gradient(135deg, #ff416c, #ff4b2b);
            color: white;
        }

        /* Card Styles */
        .metric-card {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 1.5rem;
            border-radius: 15px;
            color: white;
            text-align: center;
            margin: 0.5rem 0;
            box-shadow: 0 10px 30px rgba(102, 126, 234, 0.3);
            transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
            position: relative;
            overflow: hidden;
        }

        .metric-card:hover {
            transform: translateY(-10px) scale(1.02);
            box-shadow: 0 20px 50px rgba(102, 126, 234, 0.4);
        }

        .metric-value {
            font-size: 2.5rem;
            font-weight: 700;
            margin: 0.5rem 0;
            text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
        }

        .metric-label {
            font-size: 1rem;
            opacity: 0.9;
            text-transform: uppercase;
            letter-spacing: 1px;
        }

        /* Feature Cards */
        .feature-card {
            background: rgba(255, 255, 255, 0.95);
            backdrop-filter: blur(10px);
            padding: 2rem;
            border-radius: 20px;
            box-shadow: 0 15px 35px rgba(0, 0, 0, 0.1);
            margin: 1rem 0;
            border: 1px solid rgba(255, 255, 255, 0.2);
            transition: all 0.3s ease;
            position: relative;
        }

        .feature-card:hover {
            transform: translateY(-5px);
            box-shadow: 0 25px 50px rgba(0, 0, 0, 0.15);
        }

        .feature-card::before {
            content: '';
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            height: 4px;
            background: linear-gradient(90deg, #667eea, #764ba2);
            border-radius: 20px 20px 0 0;
        }

        /* Hide Streamlit elements */
        #MainMenu { visibility: hidden; }
        footer { visibility: hidden; }
        header { visibility: hidden; }
        .stDeployButton { display: none; }

        /* Responsive Design */
        @media (max-width: 768px) {
            .main-header h1 { font-size: 1.8rem; }
            .main-header p { font-size: 1rem; }
            .metric-card { margin: 0.25rem 0; }
            .feature-card { padding: 1rem; }
        }
    </style>
    """, unsafe_allow_html=True)
211
+
212
+ # Enhanced Database Manager Class with Robust Error Handling
213
class DatabaseManager:
    """SQLite-backed storage for legal documents and scraped web content.

    Handles database-path auto-detection with writable fallbacks (useful in
    read-only / containerized deployments such as Hugging Face Spaces),
    schema creation, health checks and basic CRUD with defensive error
    handling. All query methods swallow errors, log them, and return a
    neutral value so the UI can keep rendering.
    """

    def __init__(self, db_path: str = None):
        """Initialize the manager and create the schema.

        Args:
            db_path: Explicit database file path. If None, the first
                writable location among several fallbacks is used.
        """
        self.logger = logging.getLogger(__name__)

        if db_path:
            self.db_path = db_path
        else:
            # Ordered by preference; later entries are fallbacks for
            # cloud/container environments with restricted filesystems.
            possible_paths = [
                "./data/iran_legal.db",               # Preferred location
                "/tmp/iran_legal.db",                 # Temp dir (cloud/containers)
                os.path.expanduser("~/iran_legal.db"),  # User home directory
                "./iran_legal.db",                    # Current directory
            ]
            self.db_path = self._find_writable_path(possible_paths)

        self.logger.info(f"Using database path: {self.db_path}")
        self.initialize_database()

    def _find_writable_path(self, paths: List[str]) -> str:
        """Return the first path whose directory is (or can be made) writable.

        Args:
            paths: Candidate database file paths, in order of preference.

        Returns:
            The first candidate that passes a write probe, or
            "./iran_legal.db" when none does.
        """
        for path in paths:
            try:
                directory = os.path.dirname(path)
                if directory and not os.path.exists(directory):
                    os.makedirs(directory, exist_ok=True)

                # Probe writability with a throwaway file next to the db.
                test_file = path + ".test"
                with open(test_file, 'w') as f:
                    f.write("test")
                os.remove(test_file)

                self.logger.info(f"Found writable path: {path}")
                return path

            except (OSError, PermissionError) as e:
                self.logger.warning(f"Cannot write to {path}: {e}")
                continue

        default_path = "./iran_legal.db"
        self.logger.warning(f"No writable path found, using default: {default_path}")
        return default_path

    def initialize_database(self):
        """Create the schema, enabling WAL mode and foreign keys.

        Raises:
            sqlite3.OperationalError: When the error handler cannot recover.
            Exception: Any unexpected initialization failure.
        """
        try:
            directory = os.path.dirname(self.db_path)
            if directory and not os.path.exists(directory):
                os.makedirs(directory, exist_ok=True)
                self.logger.info(f"Created directory: {directory}")

            # NOTE: sqlite3's "with connect(...)" only wraps a transaction,
            # it does NOT close the connection -- close explicitly to avoid
            # leaking a file handle (and a lingering WAL lock) per init.
            conn = sqlite3.connect(self.db_path, timeout=10.0)
            try:
                # Enable WAL mode for better concurrency
                conn.execute("PRAGMA journal_mode=WAL;")
                conn.execute("PRAGMA foreign_keys = ON")

                # Sanity check: confirm the engine responds before DDL.
                cursor = conn.cursor()
                cursor.execute("SELECT sqlite_version();")
                version = cursor.fetchone()[0]
                self.logger.info(f"SQLite version: {version}")

                self._create_tables(conn)
            finally:
                conn.close()

            self.logger.info("Database initialized successfully")

        except sqlite3.OperationalError as e:
            self.logger.error(f"SQLite operational error: {e}")
            self._handle_database_error(e)

        except PermissionError as e:
            self.logger.error(f"Permission error accessing database: {e}")
            self._handle_permission_error()

        except Exception as e:
            self.logger.error(f"Unexpected error initializing database: {e}")
            raise

    def _create_tables(self, conn):
        """Create the documents/scraped_items tables and their indexes.

        Args:
            conn: An open sqlite3 connection (caller owns its lifetime).

        Raises:
            sqlite3.Error: If any DDL statement fails.
        """
        try:
            cursor = conn.cursor()

            # Documents table: uploaded/analyzed legal documents.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    content TEXT NOT NULL,
                    source TEXT,
                    category TEXT,
                    ai_score REAL DEFAULT 0.0,
                    keywords TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    file_size INTEGER DEFAULT 0,
                    language TEXT DEFAULT 'fa'
                )
            """)

            # Scraped items table: content harvested from the web.
            # id is caller-supplied text (e.g. a content hash), not autoinc.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS scraped_items (
                    id TEXT PRIMARY KEY,
                    url TEXT NOT NULL,
                    title TEXT,
                    content TEXT,
                    domain TEXT,
                    rating_score REAL DEFAULT 0.0,
                    word_count INTEGER DEFAULT 0,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    status TEXT DEFAULT 'completed'
                )
            """)

            # Indexes for the dashboard's common filters/sorts.
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_documents_category
                ON documents(category);
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_documents_ai_score
                ON documents(ai_score);
            """)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_scraped_domain
                ON scraped_items(domain);
            """)

            conn.commit()
            self.logger.info("Database tables created/verified")

        except sqlite3.Error as e:
            self.logger.error(f"Error creating tables: {e}")
            raise

    def _handle_database_error(self, error):
        """Attempt recovery from SQLite operational errors.

        For "database is locked" a process-unique fallback path is tried and,
        if schema creation succeeds there, the error is considered handled.
        (Previously the path was switched but the error was still re-raised,
        so recovery never actually happened.) All other errors are logged
        and re-raised.

        Args:
            error: The original sqlite3.OperationalError.

        Raises:
            The original error, when recovery is impossible or fails.
        """
        error_msg = str(error).lower()

        if "database is locked" in error_msg:
            self.logger.error("Database is locked. Attempting recovery...")
            self.db_path = f"/tmp/iran_legal_{os.getpid()}.db"
            self.logger.info(f"Attempting recovery with new path: {self.db_path}")
            try:
                conn = sqlite3.connect(self.db_path, timeout=10.0)
                try:
                    self._create_tables(conn)
                finally:
                    conn.close()
                self.logger.info("Recovered using fallback database path")
                return  # recovered -- do not re-raise
            except Exception as retry_error:
                self.logger.error(f"Recovery attempt failed: {retry_error}")

        elif "disk i/o error" in error_msg:
            self.logger.error("Disk I/O error. Check disk space and permissions.")

        elif "database disk image is malformed" in error_msg:
            self.logger.error("Database file is corrupted. Attempting backup and recreation...")

        else:
            self.logger.error(f"Unknown database error: {error}")

        # Re-raise the error after logging (no recovery path succeeded).
        raise error

    def _handle_permission_error(self):
        """Fall back to a process-unique temp-dir database on PermissionError.

        Raises:
            Exception: If initialization with the fallback path also fails.
        """
        self.logger.error("Permission denied accessing database path")

        fallback_path = f"/tmp/iran_legal_{os.getpid()}.db"
        self.logger.info(f"Attempting fallback to: {fallback_path}")
        self.db_path = fallback_path

        try:
            # Close explicitly -- "with connect(...)" does not do it.
            conn = sqlite3.connect(self.db_path)
            try:
                self._create_tables(conn)
            finally:
                conn.close()
            self.logger.info("Successfully initialized with fallback path")
        except Exception as e:
            self.logger.error(f"Fallback also failed: {e}")
            raise

    @contextmanager
    def get_connection(self):
        """Yield a row-factory-enabled connection, always closing it after use.

        Yields:
            sqlite3.Connection with sqlite3.Row rows (dict-style access).

        Raises:
            Exception: Re-raises any connection/query error after logging.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path, timeout=10.0)
            conn.row_factory = sqlite3.Row
            yield conn
        except Exception as e:
            self.logger.error(f"Database connection error: {e}")
            raise
        finally:
            if conn:
                conn.close()

    def health_check(self) -> Dict:
        """Run connectivity/integrity checks and gather size information.

        Returns:
            On success: {'status': 'healthy', 'path', 'integrity',
            'size_mb', 'writable'}. On failure: {'status': 'unhealthy',
            'error', 'path'} -- never raises.
        """
        try:
            with self.get_connection() as conn:
                cursor = conn.cursor()

                # Basic connectivity test
                cursor.execute("SELECT 1;")

                # Full integrity scan; returns the string 'ok' when clean.
                cursor.execute("PRAGMA integrity_check;")
                integrity = cursor.fetchone()[0]

                # File size = page_count * page_size.
                cursor.execute("PRAGMA page_count;")
                page_count = cursor.fetchone()[0]
                cursor.execute("PRAGMA page_size;")
                page_size = cursor.fetchone()[0]
                size_mb = (page_count * page_size) / (1024 * 1024)

                return {
                    "status": "healthy",
                    "path": self.db_path,
                    "integrity": integrity,
                    "size_mb": round(size_mb, 2),
                    "writable": os.access(os.path.dirname(self.db_path) or ".", os.W_OK)
                }

        except Exception as e:
            return {
                "status": "unhealthy",
                "error": str(e),
                "path": self.db_path
            }

    def add_document(self, doc_data: Dict) -> int:
        """Insert a document row.

        Args:
            doc_data: Keys title, content, source, category, ai_score,
                keywords (list, stored as JSON) and file_size; all optional.

        Returns:
            The new row id, or 0 on failure (logged, never raised).
        """
        try:
            with self.get_connection() as conn:
                cursor = conn.execute("""
                    INSERT INTO documents (title, content, source, category, ai_score, keywords, file_size)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """, (
                    doc_data.get('title', ''),
                    doc_data.get('content', ''),
                    doc_data.get('source', ''),
                    doc_data.get('category', ''),
                    doc_data.get('ai_score', 0.0),
                    json.dumps(doc_data.get('keywords', [])),
                    doc_data.get('file_size', 0)
                ))
                doc_id = cursor.lastrowid
                conn.commit()
                return doc_id
        except Exception as e:
            self.logger.error(f"Error adding document: {e}")
            return 0

    def add_scraped_item(self, item_data: Dict) -> bool:
        """Upsert a scraped item (INSERT OR REPLACE keyed on its text id).

        Args:
            item_data: Keys id, url, title, content, domain, rating_score,
                word_count, status; all optional.

        Returns:
            True on success, False on failure (logged, never raised).
        """
        try:
            with self.get_connection() as conn:
                conn.execute("""
                    INSERT OR REPLACE INTO scraped_items
                    (id, url, title, content, domain, rating_score, word_count, status)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    item_data.get('id', ''),
                    item_data.get('url', ''),
                    item_data.get('title', ''),
                    item_data.get('content', ''),
                    item_data.get('domain', ''),
                    item_data.get('rating_score', 0.0),
                    item_data.get('word_count', 0),
                    item_data.get('status', 'completed')
                ))
                conn.commit()
                return True
        except Exception as e:
            self.logger.error(f"Error adding scraped item: {e}")
            return False

    def get_documents(self, limit: int = 100) -> List[Dict]:
        """Return up to *limit* documents, newest first; [] on failure."""
        try:
            with self.get_connection() as conn:
                cursor = conn.execute("""
                    SELECT * FROM documents
                    ORDER BY created_at DESC
                    LIMIT ?
                """, (limit,))
                return [dict(row) for row in cursor.fetchall()]
        except Exception as e:
            self.logger.error(f"Error getting documents: {e}")
            return []

    def get_scraped_items(self, limit: int = 100) -> List[Dict]:
        """Return up to *limit* scraped items, newest first; [] on failure."""
        try:
            with self.get_connection() as conn:
                cursor = conn.execute("""
                    SELECT * FROM scraped_items
                    ORDER BY created_at DESC
                    LIMIT ?
                """, (limit,))
                return [dict(row) for row in cursor.fetchall()]
        except Exception as e:
            self.logger.error(f"Error getting scraped items: {e}")
            return []

    def search_content(self, query: str, limit: int = 50) -> List[Dict]:
        """Substring-search titles/contents of both tables.

        Each table contributes at most limit//2 rows; the merged result is
        sorted by score (ai_score or rating_score) descending.

        Args:
            query: Plain substring (wrapped in SQL LIKE wildcards).
            limit: Maximum total rows returned.

        Returns:
            List of row dicts with a 'type' discriminator ('document' or
            'scraped'); [] or partial results on failure.
        """
        results = []
        try:
            with self.get_connection() as conn:
                # Search in documents
                cursor = conn.execute("""
                    SELECT 'document' as type, id, title, content, ai_score as score, created_at
                    FROM documents
                    WHERE title LIKE ? OR content LIKE ?
                    ORDER BY ai_score DESC
                    LIMIT ?
                """, (f'%{query}%', f'%{query}%', limit//2))
                results.extend([dict(row) for row in cursor.fetchall()])

                # Search in scraped items
                cursor = conn.execute("""
                    SELECT 'scraped' as type, id, title, content, rating_score as score, created_at
                    FROM scraped_items
                    WHERE title LIKE ? OR content LIKE ?
                    ORDER BY rating_score DESC
                    LIMIT ?
                """, (f'%{query}%', f'%{query}%', limit//2))
                results.extend([dict(row) for row in cursor.fetchall()])

        except Exception as e:
            self.logger.error(f"Error searching content: {e}")

        return sorted(results, key=lambda x: x.get('score', 0), reverse=True)[:limit]

    def get_statistics(self) -> Dict:
        """Aggregate dashboard statistics.

        Returns:
            Dict with total_documents, total_scraped, avg_ai_score,
            avg_rating and a categories -> count mapping. Averages only
            consider rows with a positive score; zeros on failure.
        """
        stats = {
            'total_documents': 0,
            'total_scraped': 0,
            'avg_ai_score': 0.0,
            'avg_rating': 0.0,
            'categories': {}
        }

        try:
            with self.get_connection() as conn:
                cursor = conn.execute("SELECT COUNT(*) FROM documents")
                stats['total_documents'] = cursor.fetchone()[0]

                cursor = conn.execute("SELECT AVG(ai_score) FROM documents WHERE ai_score > 0")
                result = cursor.fetchone()[0]
                stats['avg_ai_score'] = result if result else 0.0

                cursor = conn.execute("SELECT COUNT(*) FROM scraped_items")
                stats['total_scraped'] = cursor.fetchone()[0]

                cursor = conn.execute("SELECT AVG(rating_score) FROM scraped_items WHERE rating_score > 0")
                result = cursor.fetchone()[0]
                stats['avg_rating'] = result if result else 0.0

                cursor = conn.execute("""
                    SELECT category, COUNT(*)
                    FROM documents
                    WHERE category IS NOT NULL
                    GROUP BY category
                """)
                stats['categories'] = dict(cursor.fetchall())

        except Exception as e:
            self.logger.error(f"Error getting statistics: {e}")

        return stats
613
+
614
+ # AI Analysis Engine (same as before)
615
class AIAnalysisEngine:
    """Heuristic analyzer for Persian legal text.

    Scores text quality, predicts a legal category, extracts frequent
    keywords and detects language using keyword lists and regexes only --
    no external model is required.
    """

    def __init__(self):
        # Category name -> indicative Persian legal terms; used both for
        # quality scoring and for category prediction.
        self.legal_keywords = {
            'قانون': ['قانون', 'ماده', 'تبصره', 'بند', 'فصل', 'باب', 'مصوبه'],
            'قرارداد': ['قرارداد', 'عقد', 'طرفین', 'متعاهدین', 'شرایط', 'مفاد'],
            'حکم': ['حکم', 'رای', 'دادگاه', 'قاضی', 'شعبه', 'دعوا', 'خواهان'],
            'اداری': ['اداره', 'سازمان', 'وزارت', 'دولت', 'مقررات', 'بخشنامه']
        }

    def analyze_text(self, text: str, title: str = "") -> Dict:
        """Analyze text and return comprehensive analysis.

        Args:
            text: Document body to analyze.
            title: Optional title; contributes to category prediction only.

        Returns:
            Dict with ai_score, category, keywords, language, word_count and
            char_count. Empty/falsy text yields a minimal stub result.
        """
        if not text:
            return {'ai_score': 0.0, 'category': 'نامشخص', 'keywords': []}

        quality_score = self._calculate_quality_score(text)
        # Title words also count toward category prediction.
        category = self._predict_category(text + " " + title)
        keywords = self._extract_keywords(text)
        language = self._detect_language(text)

        return {
            'ai_score': quality_score,
            'category': category,
            'keywords': keywords,
            'language': language,
            'word_count': len(text.split()),
            'char_count': len(text)
        }

    def _calculate_quality_score(self, text: str) -> float:
        """Return a quality score in [0, 1] combining length, legal-term
        density, structural markers and Persian-character ratio."""
        score = 0.0
        word_count = len(text.split())

        # Length scoring: medium-length documents score best.
        if 50 <= word_count <= 5000:
            score += 0.3
        elif word_count > 5000:
            score += 0.2
        elif word_count >= 20:
            score += 0.1

        # Legal terms scoring: counts every occurrence across all categories.
        legal_term_count = 0
        for category_terms in self.legal_keywords.values():
            for term in category_terms:
                legal_term_count += text.count(term)

        if legal_term_count >= 5:
            score += 0.4
        elif legal_term_count >= 2:
            score += 0.2
        elif legal_term_count >= 1:
            score += 0.1

        # Structure scoring: article numbers, clause letters, slash dates.
        if re.search(r'ماده\s*\d+', text):
            score += 0.1
        if re.search(r'بند\s*[الف-ی]', text):
            score += 0.05
        if re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', text):
            score += 0.05

        # Language quality: reward predominantly Persian text.
        persian_ratio = len(re.findall(r'[\u0600-\u06FF]', text)) / max(len(text), 1)
        if persian_ratio > 0.5:
            score += 0.1

        return min(score, 1.0)

    def _predict_category(self, text: str) -> str:
        """Predict the document category by keyword-occurrence counts;
        returns 'عمومی' (general) when nothing matches."""
        text_lower = text.lower()
        category_scores = {}

        for category, keywords in self.legal_keywords.items():
            score = sum(text_lower.count(keyword) for keyword in keywords)
            category_scores[category] = score

        if category_scores:
            best_category = max(category_scores, key=category_scores.get)
            return best_category if category_scores[best_category] > 0 else 'عمومی'

        return 'عمومی'

    def _extract_keywords(self, text: str, max_keywords: int = 10) -> List[str]:
        """Return up to max_keywords most frequent Persian words.

        The regex already requires 3+ Persian characters per word, so the
        former redundant per-word length filter was removed; Counter replaces
        the hand-rolled frequency dict (equal counts keep first-encountered
        order, matching the original stable sort).
        """
        words = re.findall(r'[\u0600-\u06FF]{3,}', text)
        return [word for word, _ in Counter(words).most_common(max_keywords)]

    def _detect_language(self, text: str) -> str:
        """Detect text language: 'fa' when Persian chars outnumber Latin,
        'en' when any Latin letters remain, else 'unknown'."""
        persian_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))

        if persian_chars > english_chars:
            return 'fa'
        elif english_chars > 0:
            return 'en'
        else:
            return 'unknown'
731
+
732
+ # Web Scraping Service (same as before, keeping it brief for space)
733
class WebScrapingService:
    """Fetches web pages over HTTP and extracts their main textual content."""

    def __init__(self):
        # One shared session (connection reuse) with a browser-like
        # User-Agent, since some sites reject default client agents.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def scrape_url(self, url: str) -> Optional[Dict]:
        """Scrape content from a single URL.

        Args:
            url: Absolute URL to fetch (15 s timeout).

        Returns:
            Dict with url/title/content/domain/word_count/success on success,
            or None when the request fails or the page yields fewer than 50
            characters of usable text.
        """
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove unwanted elements (scripts, styles, chrome/boilerplate).
            for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                element.decompose()

            # Extract title (Persian fallback means "untitled").
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else "بدون عنوان"

            # Extract main content
            content = self._extract_main_content(soup)

            # Too little text usually means a JS-rendered or empty page.
            if not content or len(content.strip()) < 50:
                return None

            # Collapse all whitespace runs into single spaces.
            content = re.sub(r'\s+', ' ', content).strip()

            return {
                'url': url,
                'title': title,
                'content': content,
                'domain': urlparse(url).netloc,
                'word_count': len(content.split()),
                'success': True
            }

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")
            return None

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from the parsed page.

        Tries a list of common content selectors (first match wins) and falls
        back to the whole <body> text when none matches.
        """
        # Try different content selectors, roughly most-specific first.
        content_selectors = [
            'article', '.content', '.main-content', '#content',
            '.post-content', '.entry-content', 'main', '.container'
        ]

        content = ""
        for selector in content_selectors:
            elements = soup.select(selector)
            if elements:
                content = ' '.join([elem.get_text().strip() for elem in elements])
                break

        # Fallback to body
        if not content:
            body = soup.find('body')
            if body:
                content = body.get_text()

        return content
800
+
801
+ # Rating Service (same as before)
802
class RatingService:
    """Scores scraped legal content by source credibility, body quality and
    title relevance, producing a value in [0, 1]."""

    def __init__(self):
        # Hand-curated credibility scores for well-known Iranian legal sources.
        self.trusted_domains = {
            'dastour.ir': 1.0,
            'mizanonline.ir': 0.9,
            'judiciary.ir': 1.0,
            'majlis.ir': 1.0,
            'dolat.ir': 0.8,
            'rc.majlis.ir': 0.9
        }

    def rate_content(self, content_data: Dict) -> float:
        """Rate one scraped item.

        Weighting: source credibility 40%, body quality 40% (length plus
        legal-term hits), title relevance 20%; the result is capped at 1.0.

        Args:
            content_data: Dict with optional 'domain', 'content' and 'title'.

        Returns:
            Quality score between 0.0 and 1.0.
        """
        total = 0.0

        # --- Source credibility (40%) ---
        host = content_data.get('domain', '')
        credibility = self.trusted_domains.get(host, 0.3)
        if '.gov.ir' in host:
            credibility = max(credibility, 0.7)
        elif '.ir' in host:
            credibility = max(credibility, 0.4)
        total += credibility * 0.4

        # --- Body quality (40%): length tier + legal-term coverage ---
        body = content_data.get('content', '')
        n_words = len(body.split())
        size_part = (0.3 if n_words >= 200 else
                     0.2 if n_words >= 100 else
                     0.1 if n_words >= 50 else
                     0.0)

        indicator_terms = ['قانون', 'حقوق', 'دادگاه', 'ماده', 'حکم', 'رای']
        hits = sum(1 for term in indicator_terms if term in body)
        term_part = min(hits / len(indicator_terms), 1.0) * 0.1

        total += (size_part + term_part) * 0.4

        # --- Title relevance (20%): non-trivial length + any legal term ---
        heading = content_data.get('title', '')
        heading_part = 0.1 if len(heading) > 10 else 0.0
        if any(term in heading for term in indicator_terms):
            heading_part += 0.1
        total += heading_part * 0.2

        return min(total, 1.0)
857
+
858
+ # UI Helper Functions
859
def show_status_message(message: str, status_type: str = "info"):
    """Render *message* as a colored status pill.

    Args:
        message: Text to display.
        status_type: One of the CSS suffixes defined in load_css():
            "success", "warning", "info" (default) or "error".
    """
    html = f'<div class="status-indicator status-{status_type}">{message}</div>'
    st.markdown(html, unsafe_allow_html=True)
863
+
864
def create_metric_card(title: str, value: str, subtitle: str = "") -> str:
    """Return the HTML for a styled metric card.

    The caller renders it via st.markdown(..., unsafe_allow_html=True);
    styling comes from the .metric-card rules injected by load_css().

    Args:
        title: Small uppercase label shown above the value.
        value: The headline number/text.
        subtitle: Optional smaller caption; its <div> is omitted entirely
            when empty.
    """
    return f"""
    <div class="metric-card">
        <div class="metric-label">{title}</div>
        <div class="metric-value">{value}</div>
        {f'<div style="font-size: 0.9rem; opacity: 0.8;">{subtitle}</div>' if subtitle else ''}
    </div>
    """
873
+
874
+ # Enhanced Initialize services with health check
875
@st.cache_resource
def initialize_services():
    """Initialize all services with a database health check.

    Cached by Streamlit (@st.cache_resource) so the database manager and
    service objects are created once per process rather than on every rerun.

    Returns:
        Tuple (DatabaseManager, AIAnalysisEngine, WebScrapingService,
        RatingService), or (None, None, None, None) when initialization
        raised -- callers must check for None before use.
    """
    try:
        # Initialize database manager with fallback paths
        db_manager = DatabaseManager()

        # Perform health check; an unhealthy DB is reported but NOT fatal --
        # the manager object is still returned for limited functionality.
        health = db_manager.health_check()
        if health["status"] == "unhealthy":
            st.error(f"❌ Database health check failed: {health['error']}")
            st.warning("⚠️ The application will continue with limited functionality.")
        else:
            st.success(f"✅ Database initialized successfully (Size: {health['size_mb']} MB)")

        # Initialize other services
        ai_engine = AIAnalysisEngine()
        scraping_service = WebScrapingService()
        rating_service = RatingService()

        return db_manager, ai_engine, scraping_service, rating_service

    except Exception as e:
        st.error(f"❌ Failed to initialize services: {e}")
        st.info("Please check the logs for more details.")

        # Return None objects to prevent further errors
        return None, None, None, None
903
+
904
def debug_database_environment():
    """Render diagnostics for database initialization failures.

    Shows process/environment facts, writability of every candidate
    database directory (mirroring DatabaseManager's fallback list), and the
    SQLite library version.
    """
    st.markdown("### 🔧 Database Environment Debug")

    debug_info = {
        "Current working directory": os.getcwd(),
        "Python executable": sys.executable,
        "Operating system": os.name,
        "User": os.getenv('USER', os.getenv('USERNAME', 'unknown')),
    }

    for key, value in debug_info.items():
        st.write(f"**{key}:** {value}")

    # Check common paths
    st.markdown("#### 📁 Path Accessibility Check")
    paths_to_check = [
        "./",
        "./data/",
        "/tmp/",
        os.path.expanduser("~/")
    ]

    for path in paths_to_check:
        try:
            # A missing path is reported as not writable rather than probed.
            writable = os.access(path, os.W_OK) if os.path.exists(path) else False
            exists = os.path.exists(path)

            if exists and writable:
                st.success(f"✅ {path} - Exists: {exists}, Writable: {writable}")
            elif exists:
                st.warning(f"⚠️ {path} - Exists: {exists}, Writable: {writable}")
            else:
                st.error(f"❌ {path} - Exists: {exists}, Writable: {writable}")

        except Exception as e:
            st.error(f"❌ {path} - Error checking: {e}")

    # Check SQLite (local import is a belt-and-braces probe; sqlite3 is
    # also imported at module level).
    try:
        import sqlite3
        st.success(f"✅ SQLite version: {sqlite3.sqlite_version}")
    except ImportError:
        st.error("❌ SQLite not available")
948
+
949
+ # Main Application
950
def main():
    """Application entry point: inject CSS, render the header, initialize
    services and route the sidebar selection to a page renderer."""
    load_css()

    # Header
    st.markdown("""
    <div class="main-header">
        <h1>🏛️ داشبورد اطلاعات حقوقی جمهوری اسلامی ایران</h1>
        <p>سیستم جامع جمع‌آوری، تحلیل و رتبه‌بندی اطلاعات حقوقی با هوش مصنوعی</p>
    </div>
    """, unsafe_allow_html=True)

    # Initialize services (cached across reruns via st.cache_resource)
    db_manager, ai_engine, scraping_service, rating_service = initialize_services()

    # Show debug info if database failed, then abort the rest of the page.
    if db_manager is None:
        st.warning("⚠️ Database initialization failed. Showing debug information:")
        debug_database_environment()
        return

    # Sidebar navigation
    st.sidebar.markdown("### 📋 منوی اصلی")

    # Display label (Persian) -> internal route key.
    pages = {
        "🏠 داشبورد اصلی": "dashboard",
        "🌐 اسکرپینگ وب": "scraping",
        "📄 مدیریت اسناد": "documents",
        "🔍 جستجو و تحلیل": "search",
        "📊 گزارشات و آمار": "reports",
        "🔧 تنظیمات و دیباگ": "debug"
    }

    selected_page = st.sidebar.selectbox("انتخاب صفحه:", list(pages.keys()))
    page_key = pages[selected_page]

    # Route to appropriate page. Only "dashboard" and "debug" are handled
    # here; the remaining keys are placeholders with no renderer yet.
    if page_key == "dashboard":
        show_dashboard(db_manager)
    elif page_key == "debug":
        debug_database_environment()
    # Add other page handlers here...
991
+
992
def show_dashboard(db_manager: DatabaseManager):
    """Display the main dashboard: four metric cards built from database
    statistics, followed by a database health status banner."""
    # Get statistics
    stats = db_manager.get_statistics()

    # Metrics row (labels are Persian: total docs, web content,
    # AI quality, rating)
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.markdown(create_metric_card(
            "کل اسناد",
            str(stats['total_documents']),
            "اسناد ثبت شده"
        ), unsafe_allow_html=True)

    with col2:
        st.markdown(create_metric_card(
            "محتوای وب",
            str(stats['total_scraped']),
            "آیتم جمع‌آوری شده"
        ), unsafe_allow_html=True)

    with col3:
        st.markdown(create_metric_card(
            "کیفیت AI",
            f"{stats['avg_ai_score']:.2f}",
            "میانگین امتیاز"
        ), unsafe_allow_html=True)

    with col4:
        st.markdown(create_metric_card(
            "رتبه‌بندی",
            f"{stats['avg_rating']:.2f}",
            "میانگین کیفیت"
        ), unsafe_allow_html=True)

    # Database health status
    st.markdown("---")
    health = db_manager.health_check()

    if health["status"] == "healthy":
        show_status_message(f"✅ Database Status: Healthy (Path: {health['path']})", "success")
    else:
        show_status_message(f"❌ Database Status: Unhealthy - {health['error']}", "error")
1036
+
1037
# Run the application
# Standard entry-point guard so importing this module does not launch the UI.
if __name__ == "__main__":
    main()