Really-amin commited on
Commit
ccbebf0
·
verified ·
1 Parent(s): 5ccfb05

Rename src/streamlit_app.py to streamlit_app.py

Browse files
src/streamlit_app.py → streamlit_app.py RENAMED
@@ -1,8 +1,9 @@
1
  #!/usr/bin/env python3
2
  """
3
- Iran Legal Information Dashboard
4
- ================================
5
- Complete Working System for Legal Document Management, OCR, AI Analysis, and Web Scraping
 
6
  """
7
 
8
  import streamlit as st
@@ -19,6 +20,7 @@ import logging
19
  import time
20
  import re
21
  import asyncio
 
22
  from datetime import datetime, timedelta
23
  from typing import Dict, List, Optional, Any, Tuple
24
  from urllib.parse import urlparse, urljoin
@@ -40,7 +42,7 @@ st.set_page_config(
40
  initial_sidebar_state="expanded"
41
  )
42
 
43
- # Advanced CSS for beautiful UI
44
  def load_css():
45
  st.markdown("""
46
  <style>
@@ -98,6 +100,37 @@ def load_css():
98
  margin: 0;
99
  }
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  /* Card Styles */
102
  .metric-card {
103
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
@@ -112,21 +145,6 @@ def load_css():
112
  overflow: hidden;
113
  }
114
 
115
- .metric-card::before {
116
- content: '';
117
- position: absolute;
118
- top: 0;
119
- left: -100%;
120
- width: 100%;
121
- height: 100%;
122
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.2), transparent);
123
- transition: left 0.5s;
124
- }
125
-
126
- .metric-card:hover::before {
127
- left: 100%;
128
- }
129
-
130
  .metric-card:hover {
131
  transform: translateY(-10px) scale(1.02);
132
  box-shadow: 0 20px 50px rgba(102, 126, 234, 0.4);
@@ -175,232 +193,12 @@ def load_css():
175
  border-radius: 20px 20px 0 0;
176
  }
177
 
178
- /* Status Indicators */
179
- .status-indicator {
180
- display: inline-flex;
181
- align-items: center;
182
- padding: 0.25rem 0.75rem;
183
- border-radius: 20px;
184
- font-size: 0.85rem;
185
- font-weight: 500;
186
- margin: 0.25rem;
187
- }
188
-
189
- .status-success {
190
- background: linear-gradient(135deg, #11998e, #38ef7d);
191
- color: white;
192
- }
193
-
194
- .status-warning {
195
- background: linear-gradient(135deg, #f093fb, #f5576c);
196
- color: white;
197
- }
198
-
199
- .status-info {
200
- background: linear-gradient(135deg, #4facfe, #00f2fe);
201
- color: white;
202
- }
203
-
204
- .status-error {
205
- background: linear-gradient(135deg, #ff416c, #ff4b2b);
206
- color: white;
207
- }
208
-
209
- /* Upload Area */
210
- .upload-area {
211
- border: 3px dashed #667eea;
212
- border-radius: 20px;
213
- padding: 3rem;
214
- text-align: center;
215
- background: linear-gradient(135deg, rgba(102, 126, 234, 0.05), rgba(118, 75, 162, 0.05));
216
- margin: 2rem 0;
217
- transition: all 0.3s ease;
218
- position: relative;
219
- overflow: hidden;
220
- }
221
-
222
- .upload-area:hover {
223
- border-color: #764ba2;
224
- background: linear-gradient(135deg, rgba(102, 126, 234, 0.1), rgba(118, 75, 162, 0.1));
225
- transform: scale(1.02);
226
- }
227
-
228
- .upload-area::before {
229
- content: '📁';
230
- font-size: 4rem;
231
- display: block;
232
- margin-bottom: 1rem;
233
- animation: bounce 2s infinite;
234
- }
235
-
236
- @keyframes bounce {
237
- 0%, 20%, 50%, 80%, 100% { transform: translateY(0); }
238
- 40% { transform: translateY(-10px); }
239
- 60% { transform: translateY(-5px); }
240
- }
241
-
242
- /* Buttons */
243
- .stButton > button {
244
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
245
- color: white;
246
- border: none;
247
- border-radius: 12px;
248
- padding: 0.75rem 2rem;
249
- font-weight: 600;
250
- font-size: 1rem;
251
- font-family: 'Vazir', sans-serif;
252
- transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
253
- box-shadow: 0 5px 15px rgba(102, 126, 234, 0.3);
254
- position: relative;
255
- overflow: hidden;
256
- }
257
-
258
- .stButton > button:hover {
259
- transform: translateY(-3px);
260
- box-shadow: 0 10px 25px rgba(102, 126, 234, 0.4);
261
- }
262
-
263
- .stButton > button:active {
264
- transform: translateY(-1px);
265
- }
266
-
267
- /* Sidebar */
268
- .css-1d391kg {
269
- background: linear-gradient(180deg, #667eea 0%, #764ba2 100%);
270
- }
271
-
272
- .sidebar .sidebar-content {
273
- background: linear-gradient(180deg, #667eea 0%, #764ba2 100%);
274
- color: white;
275
- }
276
-
277
- /* Data Display */
278
- .data-row {
279
- background: rgba(255, 255, 255, 0.9);
280
- backdrop-filter: blur(10px);
281
- border-radius: 12px;
282
- padding: 1rem;
283
- margin: 0.5rem 0;
284
- border: 1px solid rgba(102, 126, 234, 0.1);
285
- transition: all 0.3s ease;
286
- }
287
-
288
- .data-row:hover {
289
- background: rgba(255, 255, 255, 1);
290
- transform: translateX(-5px);
291
- box-shadow: 0 5px 20px rgba(0, 0, 0, 0.1);
292
- }
293
-
294
- /* Progress Bars */
295
- .progress-container {
296
- background: rgba(255, 255, 255, 0.2);
297
- border-radius: 10px;
298
- padding: 4px;
299
- margin: 1rem 0;
300
- box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.1);
301
- }
302
-
303
- .progress-bar {
304
- background: linear-gradient(90deg, #667eea, #764ba2);
305
- height: 20px;
306
- border-radius: 8px;
307
- transition: width 0.5s cubic-bezier(0.4, 0, 0.2, 1);
308
- position: relative;
309
- overflow: hidden;
310
- }
311
-
312
- .progress-bar::after {
313
- content: '';
314
- position: absolute;
315
- top: 0;
316
- left: -100%;
317
- width: 100%;
318
- height: 100%;
319
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.3), transparent);
320
- animation: loading 1.5s infinite;
321
- }
322
-
323
- @keyframes loading {
324
- 0% { left: -100%; }
325
- 100% { left: 100%; }
326
- }
327
-
328
- /* Chart Containers */
329
- .chart-container {
330
- background: rgba(255, 255, 255, 0.95);
331
- backdrop-filter: blur(10px);
332
- border-radius: 20px;
333
- padding: 1.5rem;
334
- margin: 1rem 0;
335
- box-shadow: 0 10px 30px rgba(0, 0, 0, 0.1);
336
- border: 1px solid rgba(255, 255, 255, 0.2);
337
- }
338
-
339
- /* Search Box */
340
- .stTextInput > div > div > input {
341
- background: rgba(255, 255, 255, 0.9);
342
- border: 2px solid #667eea;
343
- border-radius: 12px;
344
- padding: 0.75rem;
345
- font-family: 'Vazir', sans-serif;
346
- transition: all 0.3s ease;
347
- }
348
-
349
- .stTextInput > div > div > input:focus {
350
- border-color: #764ba2;
351
- box-shadow: 0 0 20px rgba(102, 126, 234, 0.3);
352
- transform: scale(1.02);
353
- }
354
-
355
- /* Selectbox */
356
- .stSelectbox > div > div > select {
357
- background: rgba(255, 255, 255, 0.9);
358
- border: 2px solid #667eea;
359
- border-radius: 12px;
360
- font-family: 'Vazir', sans-serif;
361
- }
362
-
363
  /* Hide Streamlit elements */
364
  #MainMenu { visibility: hidden; }
365
  footer { visibility: hidden; }
366
  header { visibility: hidden; }
367
  .stDeployButton { display: none; }
368
 
369
- /* Custom scrollbar */
370
- ::-webkit-scrollbar {
371
- width: 8px;
372
- }
373
-
374
- ::-webkit-scrollbar-track {
375
- background: #f1f1f1;
376
- border-radius: 10px;
377
- }
378
-
379
- ::-webkit-scrollbar-thumb {
380
- background: linear-gradient(135deg, #667eea, #764ba2);
381
- border-radius: 10px;
382
- }
383
-
384
- ::-webkit-scrollbar-thumb:hover {
385
- background: linear-gradient(135deg, #764ba2, #667eea);
386
- }
387
-
388
- /* Loading Animation */
389
- .loading-spinner {
390
- border: 4px solid #f3f3f3;
391
- border-radius: 50%;
392
- border-top: 4px solid #667eea;
393
- width: 40px;
394
- height: 40px;
395
- animation: spin 1s linear infinite;
396
- margin: 20px auto;
397
- }
398
-
399
- @keyframes spin {
400
- 0% { transform: rotate(0deg); }
401
- 100% { transform: rotate(360deg); }
402
- }
403
-
404
  /* Responsive Design */
405
  @media (max-width: 768px) {
406
  .main-header h1 { font-size: 1.8rem; }
@@ -411,64 +209,258 @@ def load_css():
411
  </style>
412
  """, unsafe_allow_html=True)
413
 
414
- # Database Manager Class
415
  class DatabaseManager:
416
- def __init__(self, db_path: str = "iran_legal.db"):
417
- self.db_path = db_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  self.initialize_database()
419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  def initialize_database(self):
421
  """Initialize the database with required tables"""
422
  try:
423
- with sqlite3.connect(self.db_path) as conn:
 
 
 
 
 
 
 
 
 
424
  conn.execute("PRAGMA foreign_keys = ON")
425
 
426
- # Documents table
427
- conn.execute("""
428
- CREATE TABLE IF NOT EXISTS documents (
429
- id INTEGER PRIMARY KEY AUTOINCREMENT,
430
- title TEXT NOT NULL,
431
- content TEXT NOT NULL,
432
- source TEXT,
433
- category TEXT,
434
- ai_score REAL DEFAULT 0.0,
435
- keywords TEXT,
436
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
437
- file_size INTEGER DEFAULT 0,
438
- language TEXT DEFAULT 'fa'
439
- )
440
- """)
441
 
442
- # Scraped items table
443
- conn.execute("""
444
- CREATE TABLE IF NOT EXISTS scraped_items (
445
- id TEXT PRIMARY KEY,
446
- url TEXT NOT NULL,
447
- title TEXT,
448
- content TEXT,
449
- domain TEXT,
450
- rating_score REAL DEFAULT 0.0,
451
- word_count INTEGER DEFAULT 0,
452
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
453
- status TEXT DEFAULT 'completed'
454
- )
455
- """)
456
 
457
- conn.commit()
458
- logger.info("Database initialized successfully")
 
 
 
 
 
 
 
 
459
  except Exception as e:
460
- logger.error(f"Database initialization failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  raise
462
 
463
  @contextmanager
464
  def get_connection(self):
465
  """Get database connection with proper error handling"""
466
- conn = sqlite3.connect(self.db_path)
467
- conn.row_factory = sqlite3.Row
468
  try:
 
 
469
  yield conn
 
 
 
470
  finally:
471
- conn.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
 
473
  def add_document(self, doc_data: Dict) -> int:
474
  """Add a new document to the database"""
@@ -490,7 +482,7 @@ class DatabaseManager:
490
  conn.commit()
491
  return doc_id
492
  except Exception as e:
493
- logger.error(f"Error adding document: {e}")
494
  return 0
495
 
496
  def add_scraped_item(self, item_data: Dict) -> bool:
@@ -514,7 +506,7 @@ class DatabaseManager:
514
  conn.commit()
515
  return True
516
  except Exception as e:
517
- logger.error(f"Error adding scraped item: {e}")
518
  return False
519
 
520
  def get_documents(self, limit: int = 100) -> List[Dict]:
@@ -528,7 +520,7 @@ class DatabaseManager:
528
  """, (limit,))
529
  return [dict(row) for row in cursor.fetchall()]
530
  except Exception as e:
531
- logger.error(f"Error getting documents: {e}")
532
  return []
533
 
534
  def get_scraped_items(self, limit: int = 100) -> List[Dict]:
@@ -542,7 +534,7 @@ class DatabaseManager:
542
  """, (limit,))
543
  return [dict(row) for row in cursor.fetchall()]
544
  except Exception as e:
545
- logger.error(f"Error getting scraped items: {e}")
546
  return []
547
 
548
  def search_content(self, query: str, limit: int = 50) -> List[Dict]:
@@ -571,7 +563,7 @@ class DatabaseManager:
571
  results.extend([dict(row) for row in cursor.fetchall()])
572
 
573
  except Exception as e:
574
- logger.error(f"Error searching content: {e}")
575
 
576
  return sorted(results, key=lambda x: x.get('score', 0), reverse=True)[:limit]
577
 
@@ -615,11 +607,11 @@ class DatabaseManager:
615
  stats['categories'] = dict(cursor.fetchall())
616
 
617
  except Exception as e:
618
- logger.error(f"Error getting statistics: {e}")
619
 
620
  return stats
621
 
622
- # AI Analysis Engine
623
  class AIAnalysisEngine:
624
  def __init__(self):
625
  self.legal_keywords = {
@@ -737,7 +729,7 @@ class AIAnalysisEngine:
737
  else:
738
  return 'unknown'
739
 
740
- # Web Scraping Service
741
  class WebScrapingService:
742
  def __init__(self):
743
  self.session = requests.Session()
@@ -787,14 +779,8 @@ class WebScrapingService:
787
  """Extract main content from soup object"""
788
  # Try different content selectors
789
  content_selectors = [
790
- 'article',
791
- '.content',
792
- '.main-content',
793
- '#content',
794
- '.post-content',
795
- '.entry-content',
796
- 'main',
797
- '.container'
798
  ]
799
 
800
  content = ""
@@ -812,7 +798,7 @@ class WebScrapingService:
812
 
813
  return content
814
 
815
- # Rating Service
816
  class RatingService:
817
  def __init__(self):
818
  self.trusted_domains = {
@@ -869,45 +855,6 @@ class RatingService:
869
 
870
  return min(score, 1.0)
871
 
872
- # File processing utilities
873
- def process_pdf_file(uploaded_file) -> Dict:
874
- """Process uploaded PDF file and extract text"""
875
- try:
876
- # Save file temporarily
877
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
878
- tmp_file.write(uploaded_file.read())
879
- tmp_file_path = tmp_file.name
880
-
881
- # Try to extract text using PyMuPDF if available
882
- try:
883
- import fitz
884
- doc = fitz.open(tmp_file_path)
885
- text = ""
886
- for page in doc:
887
- text += page.get_text()
888
- doc.close()
889
- except ImportError:
890
- # Fallback: simple text extraction
891
- text = f"محتوای فایل {uploaded_file.name} (OCR در حال توسعه)"
892
-
893
- # Clean up
894
- os.unlink(tmp_file_path)
895
-
896
- return {
897
- 'success': True,
898
- 'text': text,
899
- 'file_size': uploaded_file.size,
900
- 'filename': uploaded_file.name
901
- }
902
-
903
- except Exception as e:
904
- logger.error(f"Error processing PDF: {e}")
905
- return {
906
- 'success': False,
907
- 'text': '',
908
- 'error': str(e)
909
- }
910
-
911
  # UI Helper Functions
912
  def show_status_message(message: str, status_type: str = "info"):
913
  """Show styled status message"""
@@ -924,15 +871,80 @@ def create_metric_card(title: str, value: str, subtitle: str = ""):
924
  </div>
925
  """
926
 
927
- # Initialize services
928
  @st.cache_resource
929
  def initialize_services():
930
- """Initialize all services"""
931
- db_manager = DatabaseManager()
932
- ai_engine = AIAnalysisEngine()
933
- scraping_service = WebScrapingService()
934
- rating_service = RatingService()
935
- return db_manager, ai_engine, scraping_service, rating_service
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
 
937
  # Main Application
938
  def main():
@@ -949,6 +961,12 @@ def main():
949
  # Initialize services
950
  db_manager, ai_engine, scraping_service, rating_service = initialize_services()
951
 
 
 
 
 
 
 
952
  # Sidebar navigation
953
  st.sidebar.markdown("### 📋 منوی اصلی")
954
 
@@ -957,7 +975,8 @@ def main():
957
  "🌐 اسکرپینگ وب": "scraping",
958
  "📄 مدیریت اسناد": "documents",
959
  "🔍 جستجو و تحلیل": "search",
960
- "📊 گزارشات و آمار": "reports"
 
961
  }
962
 
963
  selected_page = st.sidebar.selectbox("انتخاب صفحه:", list(pages.keys()))
@@ -966,18 +985,12 @@ def main():
966
  # Route to appropriate page
967
  if page_key == "dashboard":
968
  show_dashboard(db_manager)
969
- elif page_key == "scraping":
970
- show_scraping_page(db_manager, scraping_service, ai_engine, rating_service)
971
- elif page_key == "documents":
972
- show_documents_page(db_manager, ai_engine)
973
- elif page_key == "search":
974
- show_search_page(db_manager)
975
- elif page_key == "reports":
976
- show_reports_page(db_manager)
977
 
978
  def show_dashboard(db_manager: DatabaseManager):
979
  """Display main dashboard"""
980
-
981
  # Get statistics
982
  stats = db_manager.get_statistics()
983
 
@@ -1012,648 +1025,14 @@ def show_dashboard(db_manager: DatabaseManager):
1012
  "میانگین کیفیت"
1013
  ), unsafe_allow_html=True)
1014
 
 
1015
  st.markdown("---")
 
1016
 
1017
- # Charts section
1018
- col1, col2 = st.columns(2)
1019
-
1020
- with col1:
1021
- st.markdown('<div class="chart-container">', unsafe_allow_html=True)
1022
- st.subheader("📊 توزیع دسته‌بندی اسناد")
1023
-
1024
- if stats['categories']:
1025
- df_categories = pd.DataFrame(
1026
- list(stats['categories'].items()),
1027
- columns=['دسته‌بندی', 'تعداد']
1028
- )
1029
- fig = px.pie(df_categories, values='تعداد', names='دسته‌بندی',
1030
- title="توزیع اسناد بر اساس دسته‌بندی")
1031
- fig.update_traces(textposition='inside', textinfo='percent+label')
1032
- st.plotly_chart(fig, use_container_width=True)
1033
- else:
1034
- st.info("هنوز اسنادی دسته‌بندی نشده است")
1035
-
1036
- st.markdown('</div>', unsafe_allow_html=True)
1037
-
1038
- with col2:
1039
- st.markdown('<div class="chart-container">', unsafe_allow_html=True)
1040
- st.subheader("📈 آمار عملکرد")
1041
-
1042
- # Create performance chart
1043
- performance_data = {
1044
- 'معیار': ['اسناد', 'محتوای وب', 'کیفیت AI', 'رتبه‌بندی'],
1045
- 'مقدار': [
1046
- min(stats['total_documents'], 100),
1047
- min(stats['total_scraped'], 100),
1048
- stats['avg_ai_score'] * 100,
1049
- stats['avg_rating'] * 100
1050
- ]
1051
- }
1052
-
1053
- df_performance = pd.DataFrame(performance_data)
1054
- fig = px.bar(df_performance, x='معیار', y='مقدار',
1055
- title="نمودار عملکرد سیستم")
1056
- fig.update_layout(yaxis_title="مقدار", xaxis_title="معیارها")
1057
- st.plotly_chart(fig, use_container_width=True)
1058
-
1059
- st.markdown('</div>', unsafe_allow_html=True)
1060
-
1061
- # Recent activity
1062
- st.markdown("### 📋 فعالیت‌های اخیر")
1063
-
1064
- col1, col2 = st.columns(2)
1065
-
1066
- with col1:
1067
- st.markdown('<div class="feature-card">', unsafe_allow_html=True)
1068
- st.markdown("#### 📄 آخرین اسناد")
1069
-
1070
- recent_docs = db_manager.get_documents(limit=5)
1071
- if recent_docs:
1072
- for doc in recent_docs:
1073
- st.markdown(f"""
1074
- <div class="data-row">
1075
- <strong>{doc['title'][:50]}...</strong><br>
1076
- <small>دسته‌بندی: {doc.get('category', 'نامشخص')} |
1077
- امتیاز: {doc.get('ai_score', 0):.2f}</small>
1078
- </div>
1079
- """, unsafe_allow_html=True)
1080
- else:
1081
- st.info("هنوز اسنادی ثبت نشده است")
1082
-
1083
- st.markdown('</div>', unsafe_allow_html=True)
1084
-
1085
- with col2:
1086
- st.markdown('<div class="feature-card">', unsafe_allow_html=True)
1087
- st.markdown("#### 🌐 آخرین محتوای اسکرپ شده")
1088
-
1089
- recent_scraped = db_manager.get_scraped_items(limit=5)
1090
- if recent_scraped:
1091
- for item in recent_scraped:
1092
- st.markdown(f"""
1093
- <div class="data-row">
1094
- <strong>{item['title'][:50]}...</strong><br>
1095
- <small>دامنه: {item.get('domain', 'نامشخص')} |
1096
- رتبه: {item.get('rating_score', 0):.2f}</small>
1097
- </div>
1098
- """, unsafe_allow_html=True)
1099
- else:
1100
- st.info("هنوز محتوایی اسکرپ نشده است")
1101
-
1102
- st.markdown('</div>', unsafe_allow_html=True)
1103
-
1104
- def show_scraping_page(db_manager: DatabaseManager, scraping_service: WebScrapingService,
1105
- ai_engine: AIAnalysisEngine, rating_service: RatingService):
1106
- """Display web scraping page"""
1107
-
1108
- st.markdown("## 🌐 اسکرپینگ محتوای حقوقی")
1109
-
1110
- # Configuration section
1111
- st.markdown('<div class="feature-card">', unsafe_allow_html=True)
1112
- st.markdown("### ⚙️ تنظیمات اسکرپینگ")
1113
-
1114
- col1, col2 = st.columns(2)
1115
-
1116
- with col1:
1117
- st.markdown("#### 🎯 آدرس‌های هدف")
1118
- urls_text = st.text_area(
1119
- "آدرس‌های وب‌سایت (هر خط یک آدرس):",
1120
- value="https://dastour.ir\nhttps://mizanonline.ir/news\nhttps://judiciary.ir/news",
1121
- height=120
1122
- )
1123
-
1124
- max_pages = st.slider("حداکثر تعداد URL:", 1, 20, 5)
1125
-
1126
- with col2:
1127
- st.markdown("#### 📋 سایت‌های پیشنهادی")
1128
- st.markdown("""
1129
- **منابع معتبر حقوقی:**
1130
- - 📜 [دستور - قوانین ایران](https://dastour.ir)
1131
- - ⚖️ [میزان آنلاین](https://mizanonline.ir)
1132
- - 🏛️ [قوه قضاییه](https://judiciary.ir)
1133
- - 🏛️ [مجلس شورای اسلامی](https://majlis.ir)
1134
- - 📚 [مرکز پژوهش‌های مجلس](https://rc.majlis.ir)
1135
- """)
1136
-
1137
- st.markdown('</div>', unsafe_allow_html=True)
1138
-
1139
- # Scraping action
1140
- if st.button("🚀 شروع اسکرپینگ", type="primary"):
1141
- urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
1142
-
1143
- if not urls:
1144
- show_status_message("لطفاً حداقل یک آدرس وارد کنید", "error")
1145
- return
1146
-
1147
- # Limit URLs
1148
- urls = urls[:max_pages]
1149
-
1150
- # Progress tracking
1151
- progress_container = st.container()
1152
- status_container = st.container()
1153
- results_container = st.container()
1154
-
1155
- with progress_container:
1156
- progress_bar = st.progress(0)
1157
- status_text = st.empty()
1158
-
1159
- successful_items = []
1160
- failed_urls = []
1161
-
1162
- # Process each URL
1163
- for i, url in enumerate(urls):
1164
- with status_container:
1165
- status_text.text(f"در حال پردازش: {url}")
1166
-
1167
- # Scrape URL
1168
- scraped_data = scraping_service.scrape_url(url)
1169
-
1170
- if scraped_data:
1171
- # Rate content
1172
- rating_score = rating_service.rate_content(scraped_data)
1173
-
1174
- # Prepare item data
1175
- item_id = hashlib.md5(f"{url}_{datetime.now()}".encode()).hexdigest()[:16]
1176
- item_data = {
1177
- 'id': item_id,
1178
- 'url': url,
1179
- 'title': scraped_data['title'],
1180
- 'content': scraped_data['content'][:2000], # Limit content length
1181
- 'domain': scraped_data['domain'],
1182
- 'rating_score': rating_score,
1183
- 'word_count': scraped_data['word_count'],
1184
- 'status': 'completed'
1185
- }
1186
-
1187
- # Save to database
1188
- if db_manager.add_scraped_item(item_data):
1189
- successful_items.append(item_data)
1190
- else:
1191
- failed_urls.append(url)
1192
- else:
1193
- failed_urls.append(url)
1194
-
1195
- # Update progress
1196
- progress = (i + 1) / len(urls)
1197
- progress_bar.progress(progress)
1198
-
1199
- # Small delay to prevent overwhelming servers
1200
- time.sleep(1)
1201
-
1202
- # Show results
1203
- with status_container:
1204
- status_text.text("اسکرپینگ کامل شد!")
1205
-
1206
- with results_container:
1207
- st.markdown("### 📊 نتایج اسکرپینگ")
1208
-
1209
- if successful_items:
1210
- show_status_message(f"✅ {len(successful_items)} آیتم با موفقیت جمع‌آوری شد", "success")
1211
-
1212
- # Display results
1213
- for item in successful_items:
1214
- rating_color = "🟢" if item['rating_score'] > 0.7 else "🟡" if item['rating_score'] > 0.4 else "🔴"
1215
-
1216
- with st.expander(f"{rating_color} {item['title']} (رتبه: {item['rating_score']:.2f})"):
1217
- col1, col2 = st.columns(2)
1218
-
1219
- with col1:
1220
- st.write(f"**آدرس:** {item['url']}")
1221
- st.write(f"**دامنه:** {item['domain']}")
1222
-
1223
- with col2:
1224
- st.write(f"**تعداد کلمات:** {item['word_count']}")
1225
- st.write(f"**وضعیت:** {item['status']}")
1226
-
1227
- st.markdown("**پیش‌نمایش محتوا:**")
1228
- st.text_area("", value=item['content'][:300] + "...", height=100, disabled=True, key=f"content_{item['id']}")
1229
-
1230
- if failed_urls:
1231
- show_status_message(f"❌ {len(failed_urls)} آدرس ناموفق بود", "error")
1232
- for url in failed_urls:
1233
- st.write(f"- {url}")
1234
-
1235
- # Show scraped history
1236
- st.markdown("---")
1237
- st.markdown("### 📚 تاریخچه اسکرپینگ")
1238
-
1239
- scraped_items = db_manager.get_scraped_items(limit=20)
1240
-
1241
- if scraped_items:
1242
- for item in scraped_items:
1243
- rating_color = "🟢" if item.get('rating_score', 0) > 0.7 else "🟡" if item.get('rating_score', 0) > 0.4 else "🔴"
1244
-
1245
- with st.expander(f"{rating_color} {item.get('title', 'بدون عنوان')} - رتبه: {item.get('rating_score', 0):.2f}"):
1246
- col1, col2 = st.columns(2)
1247
-
1248
- with col1:
1249
- st.write(f"**آدرس:** {item.get('url', '')}")
1250
- st.write(f"**دامنه:** {item.get('domain', '')}")
1251
-
1252
- with col2:
1253
- st.write(f"**تاریخ:** {item.get('created_at', '')[:16]}")
1254
- st.write(f"**تعداد کلمات:** {item.get('word_count', 0)}")
1255
-
1256
- if item.get('content'):
1257
- st.text_area("محتوا:", value=item['content'][:200] + "...", height=80, disabled=True, key=f"hist_{item['id']}")
1258
- else:
1259
- st.info("هنوز آیتمی اسکرپ نشده است")
1260
-
1261
- def show_documents_page(db_manager: DatabaseManager, ai_engine: AIAnalysisEngine):
1262
- """Display documents management page"""
1263
-
1264
- st.markdown("## 📄 مدیریت اسناد")
1265
-
1266
- # File upload section
1267
- st.markdown('<div class="feature-card">', unsafe_allow_html=True)
1268
- st.markdown("### 📤 آپلود سند جدید")
1269
-
1270
- uploaded_file = st.file_uploader(
1271
- "فایل PDF انتخاب کنید:",
1272
- type=['pdf', 'txt'],
1273
- help="فایل‌های PDF و متنی پشتیبانی می‌شوند"
1274
- )
1275
-
1276
- if uploaded_file:
1277
- col1, col2 = st.columns(2)
1278
-
1279
- with col1:
1280
- source = st.text_input("منبع سند:", value="آپلود کاربر")
1281
-
1282
- with col2:
1283
- manual_category = st.selectbox(
1284
- "دسته‌بندی دستی:",
1285
- ["خودکار", "قانون", "قرارداد", "حکم", "اداری", "عمومی"]
1286
- )
1287
-
1288
- if st.button("📄 پردازش سند", type="primary"):
1289
- with st.spinner("در حال پردازش..."):
1290
- # Process file based on type
1291
- if uploaded_file.type == "application/pdf":
1292
- result = process_pdf_file(uploaded_file)
1293
- else:
1294
- # Text file
1295
- result = {
1296
- 'success': True,
1297
- 'text': str(uploaded_file.read(), 'utf-8'),
1298
- 'file_size': uploaded_file.size,
1299
- 'filename': uploaded_file.name
1300
- }
1301
-
1302
- if result['success']:
1303
- # Analyze with AI
1304
- analysis = ai_engine.analyze_text(result['text'], uploaded_file.name)
1305
-
1306
- # Prepare document data
1307
- doc_data = {
1308
- 'title': result['filename'].replace('.pdf', '').replace('.txt', ''),
1309
- 'content': result['text'],
1310
- 'source': source,
1311
- 'category': analysis['category'] if manual_category == "خودکار" else manual_category,
1312
- 'ai_score': analysis['ai_score'],
1313
- 'keywords': analysis['keywords'],
1314
- 'file_size': result['file_size']
1315
- }
1316
-
1317
- # Save to database
1318
- doc_id = db_manager.add_document(doc_data)
1319
-
1320
- if doc_id:
1321
- show_status_message(f"✅ سند با موفقیت ثبت شد (شناسه: {doc_id})", "success")
1322
-
1323
- # Show analysis results
1324
- st.markdown("#### 📊 نتایج تحلیل:")
1325
-
1326
- col1, col2, col3 = st.columns(3)
1327
- with col1:
1328
- st.metric("امتیاز کیفیت", f"{analysis['ai_score']:.2f}")
1329
- with col2:
1330
- st.metric("تعداد کلمات", analysis['word_count'])
1331
- with col3:
1332
- st.metric("دسته‌بندی", analysis['category'])
1333
-
1334
- if analysis['keywords']:
1335
- st.markdown("**کلمات کلیدی:** " + ", ".join(analysis['keywords'][:10]))
1336
- else:
1337
- show_status_message("خطا در ثبت سند", "error")
1338
- else:
1339
- show_status_message(f"خطا در پردازش فایل: {result.get('error', 'نامشخص')}", "error")
1340
-
1341
- st.markdown('</div>', unsafe_allow_html=True)
1342
-
1343
- # Documents list
1344
- st.markdown("### 📚 اسناد موجود")
1345
-
1346
- documents = db_manager.get_documents(limit=50)
1347
-
1348
- if documents:
1349
- # Filters
1350
- col1, col2 = st.columns(2)
1351
-
1352
- with col1:
1353
- categories = list(set([doc.get('category', 'نامشخص') for doc in documents]))
1354
- selected_category = st.selectbox("فیلتر دسته‌بندی:", ["همه"] + categories)
1355
-
1356
- with col2:
1357
- sort_options = ["جدیدترین", "قدیمی‌ترین", "بالاترین امتیاز", "بیشترین کلمات"]
1358
- sort_by = st.selectbox("مرتب‌سازی:", sort_options)
1359
-
1360
- # Apply filters
1361
- filtered_docs = documents
1362
- if selected_category != "همه":
1363
- filtered_docs = [doc for doc in documents if doc.get('category') == selected_category]
1364
-
1365
- # Apply sorting
1366
- if sort_by == "قدیمی‌ترین":
1367
- filtered_docs = sorted(filtered_docs, key=lambda x: x.get('created_at', ''))
1368
- elif sort_by == "بالاترین امتیاز":
1369
- filtered_docs = sorted(filtered_docs, key=lambda x: x.get('ai_score', 0), reverse=True)
1370
- elif sort_by == "بیشترین کلمات":
1371
- filtered_docs = sorted(filtered_docs, key=lambda x: len(x.get('content', '').split()), reverse=True)
1372
-
1373
- # Display documents
1374
- for doc in filtered_docs[:20]:
1375
- score_color = "🟢" if doc.get('ai_score', 0) > 0.7 else "🟡" if doc.get('ai_score', 0) > 0.4 else "🔴"
1376
-
1377
- with st.expander(f"{score_color} {doc['title']} (امتیاز: {doc.get('ai_score', 0):.2f})"):
1378
- col1, col2 = st.columns(2)
1379
-
1380
- with col1:
1381
- st.write(f"**شناسه:** {doc['id']}")
1382
- st.write(f"**دسته‌بندی:** {doc.get('category', 'نامشخص')}")
1383
- st.write(f"**منبع:** {doc.get('source', 'نامشخص')}")
1384
-
1385
- with col2:
1386
- st.write(f"**تاریخ:** {doc.get('created_at', '')[:16]}")
1387
- st.write(f"**اندازه فایل:** {doc.get('file_size', 0)} بایت")
1388
- st.write(f"**تعداد کلمات:** {len(doc.get('content', '').split())}")
1389
-
1390
- # Show keywords
1391
- try:
1392
- keywords = json.loads(doc.get('keywords', '[]'))
1393
- if keywords:
1394
- st.write("**کلمات کلیدی:** " + ", ".join(keywords[:8]))
1395
- except:
1396
- pass
1397
-
1398
- # Content preview
1399
- content_preview = doc.get('content', '')[:300] + "..."
1400
- st.text_area("پیش‌نمایش:", value=content_preview, height=100, disabled=True, key=f"doc_{doc['id']}")
1401
  else:
1402
- st.info("هنوز اسنادی ثبت نشده است")
1403
-
1404
- def show_search_page(db_manager: DatabaseManager):
1405
- """Display search page"""
1406
-
1407
- st.markdown("## 🔍 جستجو و تحلیل")
1408
-
1409
- # Search interface
1410
- st.markdown('<div class="feature-card">', unsafe_allow_html=True)
1411
- st.markdown("### 🔎 جستجوی محتوا")
1412
-
1413
- col1, col2 = st.columns([3, 1])
1414
-
1415
- with col1:
1416
- search_query = st.text_input(
1417
- "عبارت جستجو:",
1418
- placeholder="کلمات کلیدی خود را وارد کنید...",
1419
- help="در عنوان و محتوای اسناد و آیتم‌های اسکرپ شده جستجو می‌شود"
1420
- )
1421
-
1422
- with col2:
1423
- max_results = st.selectbox("حداکثر نتایج:", [10, 20, 50, 100], index=1)
1424
-
1425
- st.markdown('</div>', unsafe_allow_html=True)
1426
-
1427
- # Search execution
1428
- if search_query and len(search_query.strip()) > 2:
1429
- with st.spinner("در حال جستجو..."):
1430
- search_results = db_manager.search_content(search_query, limit=max_results)
1431
-
1432
- if search_results:
1433
- show_status_message(f"✅ {len(search_results)} نتیجه یافت شد", "success")
1434
-
1435
- # Results summary
1436
- doc_results = [r for r in search_results if r['type'] == 'document']
1437
- scraped_results = [r for r in search_results if r['type'] == 'scraped']
1438
-
1439
- col1, col2 = st.columns(2)
1440
- with col1:
1441
- st.metric("نتایج از اسناد", len(doc_results))
1442
- with col2:
1443
- st.metric("نتایج از محتوای وب", len(scraped_results))
1444
-
1445
- st.markdown("---")
1446
-
1447
- # Display results
1448
- for i, result in enumerate(search_results, 1):
1449
- score = result.get('score', 0)
1450
- score_color = "🟢" if score > 0.7 else "🟡" if score > 0.4 else "🔴"
1451
- result_type = "📄" if result['type'] == 'document' else "🌐"
1452
-
1453
- with st.expander(f"{i}. {result_type} {score_color} {result['title']} (امتیاز: {score:.2f})"):
1454
- col1, col2 = st.columns(2)
1455
-
1456
- with col1:
1457
- st.write(f"**نوع:** {'سند' if result['type'] == 'document' else 'محتوای وب'}")
1458
- st.write(f"**شناسه:** {result['id']}")
1459
- st.write(f"**امتیاز:** {score:.3f}")
1460
-
1461
- with col2:
1462
- st.write(f"**تاریخ:** {result.get('created_at', '')[:16]}")
1463
- words_count = len(result.get('content', '').split())
1464
- st.write(f"**تعداد کلمات:** {words_count}")
1465
-
1466
- # Highlight search terms in content
1467
- content = result.get('content', '')[:500]
1468
- if search_query.lower() in content.lower():
1469
- # Simple highlighting
1470
- highlighted_content = content.replace(
1471
- search_query,
1472
- f"**{search_query}**"
1473
- )
1474
- st.markdown("**محتوا:**")
1475
- st.markdown(highlighted_content + "...")
1476
- else:
1477
- st.text_area("محتوا:", value=content + "...", height=100, disabled=True, key=f"search_{result['id']}_{i}")
1478
-
1479
- else:
1480
- show_status_message("هیچ نتیجه‌ای یافت نشد. کلمات دیگری امتحان کنید.", "error")
1481
-
1482
- elif search_query and len(search_query.strip()) <= 2:
1483
- show_status_message("لطفاً حداقل 3 کاراکتر وارد کنید", "warning")
1484
-
1485
def show_reports_page(db_manager: DatabaseManager):
    """Display the reports and analytics page.

    Renders overview metrics for documents and scraped web content,
    AI-score / rating distribution charts, a per-category performance
    table, and export action buttons.

    Args:
        db_manager: Initialized DatabaseManager used to read statistics,
            documents, and scraped items.
    """

    st.markdown("## 📊 گزارشات و آمار")

    # Get comprehensive statistics
    # NOTE(review): limit=1000 caps both queries — today's-delta and
    # percentage figures below are computed over at most 1000 rows each.
    stats = db_manager.get_statistics()
    documents = db_manager.get_documents(limit=1000)
    scraped_items = db_manager.get_scraped_items(limit=1000)

    # Overview metrics
    st.markdown("### 📈 آمار کلی سیستم")

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        # Delta counts rows whose created_at date (first 10 chars,
        # assumed ISO YYYY-MM-DD — confirm against DB schema) is today.
        st.metric(
            "کل اسناد",
            stats['total_documents'],
            delta=f"+{len([d for d in documents if d.get('created_at', '')[:10] == datetime.now().strftime('%Y-%m-%d')])}" if documents else "0"
        )

    with col2:
        st.metric(
            "محتوای وب",
            stats['total_scraped'],
            delta=f"+{len([s for s in scraped_items if s.get('created_at', '')[:10] == datetime.now().strftime('%Y-%m-%d')])}" if scraped_items else "0"
        )

    with col3:
        # max(len, 1) guards the percentage against division by zero.
        high_quality_docs = len([d for d in documents if d.get('ai_score', 0) > 0.8])
        quality_percentage = (high_quality_docs / max(len(documents), 1)) * 100
        st.metric(
            "اسناد با کیفیت بالا",
            high_quality_docs,
            delta=f"{quality_percentage:.1f}%"
        )

    with col4:
        high_rating_scraped = len([s for s in scraped_items if s.get('rating_score', 0) > 0.8])
        rating_percentage = (high_rating_scraped / max(len(scraped_items), 1)) * 100
        st.metric(
            "محتوای با رتبه بالا",
            high_rating_scraped,
            delta=f"{rating_percentage:.1f}%"
        )

    st.markdown("---")

    # Detailed charts
    col1, col2 = st.columns(2)

    with col1:
        st.markdown('<div class="chart-container">', unsafe_allow_html=True)
        st.markdown("#### 📊 توزیع امتیازات AI")

        if documents:
            # Bucket AI scores into four quality bands for the bar chart.
            ai_scores = [doc.get('ai_score', 0) for doc in documents]
            score_ranges = {
                'عالی (0.8-1.0)': len([s for s in ai_scores if s >= 0.8]),
                'خوب (0.6-0.8)': len([s for s in ai_scores if 0.6 <= s < 0.8]),
                'متوسط (0.4-0.6)': len([s for s in ai_scores if 0.4 <= s < 0.6]),
                'ضعیف (0.0-0.4)': len([s for s in ai_scores if s < 0.4])
            }

            df_scores = pd.DataFrame(
                list(score_ranges.items()),
                columns=['محدوده', 'تعداد']
            )

            fig = px.bar(df_scores, x='محدوده', y='تعداد',
                        title="توزیع کیفیت اسناد",
                        color='تعداد',
                        color_continuous_scale='viridis')
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("داده‌ای برای نمایش وجود ندارد")

        st.markdown('</div>', unsafe_allow_html=True)

    with col2:
        st.markdown('<div class="chart-container">', unsafe_allow_html=True)
        st.markdown("#### 🌐 توزیع رتبه‌بندی محتوای وب")

        if scraped_items:
            # Same four bands, shown as a pie chart for scraped ratings.
            rating_scores = [item.get('rating_score', 0) for item in scraped_items]
            rating_ranges = {
                'عالی (0.8-1.0)': len([s for s in rating_scores if s >= 0.8]),
                'خوب (0.6-0.8)': len([s for s in rating_scores if 0.6 <= s < 0.8]),
                'متوسط (0.4-0.6)': len([s for s in rating_scores if 0.4 <= s < 0.6]),
                'ضعیف (0.0-0.4)': len([s for s in rating_scores if s < 0.4])
            }

            df_ratings = pd.DataFrame(
                list(rating_ranges.items()),
                columns=['محدوده', 'تعداد']
            )

            fig = px.pie(df_ratings, values='تعداد', names='محدوده',
                        title="توزیع کیفیت محتوای وب")
            fig.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("داده‌ای برای نمایش وجود ندارد")

        st.markdown('</div>', unsafe_allow_html=True)

    # Performance table
    st.markdown("### 📋 جدول عملکرد")

    performance_data = []

    if documents:
        # Aggregate document count and mean AI score per category.
        categories = {}
        for doc in documents:
            cat = doc.get('category', 'نامشخص')
            if cat not in categories:
                categories[cat] = {'count': 0, 'total_score': 0}
            categories[cat]['count'] += 1
            categories[cat]['total_score'] += doc.get('ai_score', 0)

        for cat, data in categories.items():
            avg_score = data['total_score'] / data['count'] if data['count'] > 0 else 0
            performance_data.append({
                'دسته‌بندی': cat,
                'تعداد اسناد': data['count'],
                'میانگین امتیاز AI': f"{avg_score:.3f}",
                'نوع': 'سند'
            })

    if scraped_items:
        # Aggregate scraped-item count and mean rating per source domain.
        domains = {}
        for item in scraped_items:
            domain = item.get('domain', 'نامشخص')
            if domain not in domains:
                domains[domain] = {'count': 0, 'total_rating': 0}
            domains[domain]['count'] += 1
            domains[domain]['total_rating'] += item.get('rating_score', 0)

        for domain, data in domains.items():
            avg_rating = data['total_rating'] / data['count'] if data['count'] > 0 else 0
            performance_data.append({
                'دسته‌بندی': domain,
                'تعداد اسناد': data['count'],
                'میانگین امتیاز AI': f"{avg_rating:.3f}",
                'نوع': 'محتوای وب'
            })

    if performance_data:
        df_performance = pd.DataFrame(performance_data)
        st.dataframe(df_performance, use_container_width=True)

    # Export options
    # NOTE(review): the buttons below only show status messages — the
    # actual export/cleanup actions are not implemented here.
    st.markdown("---")
    st.markdown("### 📥 گزینه‌های صادرات")

    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("📊 تولید گزارش CSV"):
            if documents or scraped_items:
                show_status_message("گزارش CSV آماده شد", "success")
            else:
                show_status_message("داده‌ای برای صادرات وجود ندارد", "warning")

    with col2:
        if st.button("📈 گزارش تفصیلی"):
            show_status_message("گزارش تفصیلی در حال آماده‌سازی", "info")

    with col3:
        if st.button("🧹 پاکسازی داده‌ها"):
            show_status_message("عملیات پاکسازی طراحی شده است", "warning")
1657
 
1658
  # Run the application
1659
  if __name__ == "__main__":
 
1
  #!/usr/bin/env python3
2
  """
3
+ Iran Legal Information Dashboard - Enhanced Version
4
+ ==================================================
5
+ Complete Working System with Robust Database Management, OCR, AI Analysis, and Web Scraping
6
+ Designed for Hugging Face Spaces deployment with enhanced error handling
7
  """
8
 
9
  import streamlit as st
 
20
  import time
21
  import re
22
  import asyncio
23
+ import sys
24
  from datetime import datetime, timedelta
25
  from typing import Dict, List, Optional, Any, Tuple
26
  from urllib.parse import urlparse, urljoin
 
42
  initial_sidebar_state="expanded"
43
  )
44
 
45
+ # Advanced CSS for beautiful UI (same as before)
46
  def load_css():
47
  st.markdown("""
48
  <style>
 
100
  margin: 0;
101
  }
102
 
103
+ /* Status Indicators */
104
+ .status-indicator {
105
+ display: inline-flex;
106
+ align-items: center;
107
+ padding: 0.25rem 0.75rem;
108
+ border-radius: 20px;
109
+ font-size: 0.85rem;
110
+ font-weight: 500;
111
+ margin: 0.25rem;
112
+ }
113
+
114
+ .status-success {
115
+ background: linear-gradient(135deg, #11998e, #38ef7d);
116
+ color: white;
117
+ }
118
+
119
+ .status-warning {
120
+ background: linear-gradient(135deg, #f093fb, #f5576c);
121
+ color: white;
122
+ }
123
+
124
+ .status-info {
125
+ background: linear-gradient(135deg, #4facfe, #00f2fe);
126
+ color: white;
127
+ }
128
+
129
+ .status-error {
130
+ background: linear-gradient(135deg, #ff416c, #ff4b2b);
131
+ color: white;
132
+ }
133
+
134
  /* Card Styles */
135
  .metric-card {
136
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
 
145
  overflow: hidden;
146
  }
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  .metric-card:hover {
149
  transform: translateY(-10px) scale(1.02);
150
  box-shadow: 0 20px 50px rgba(102, 126, 234, 0.4);
 
193
  border-radius: 20px 20px 0 0;
194
  }
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  /* Hide Streamlit elements */
197
  #MainMenu { visibility: hidden; }
198
  footer { visibility: hidden; }
199
  header { visibility: hidden; }
200
  .stDeployButton { display: none; }
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  /* Responsive Design */
203
  @media (max-width: 768px) {
204
  .main-header h1 { font-size: 1.8rem; }
 
209
  </style>
210
  """, unsafe_allow_html=True)
211
 
212
+ # Enhanced Database Manager Class with Robust Error Handling
213
  class DatabaseManager:
214
    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize DatabaseManager with robust error handling.

        Args:
            db_path (str, optional): Custom database path. If None, the
                first writable location among several fallbacks is used
                (./data, /tmp, the user's home directory, then cwd).
        """
        # Set up logging
        self.logger = logging.getLogger(__name__)

        # Set database path with fallbacks
        if db_path:
            self.db_path = db_path
        else:
            # Try multiple fallback locations, in order of preference.
            possible_paths = [
                "./data/iran_legal.db",  # Preferred location
                "/tmp/iran_legal.db",  # Temp directory (for cloud/container environments)
                os.path.expanduser("~/iran_legal.db"),  # User home directory
                "./iran_legal.db"  # Current directory
            ]

            self.db_path = self._find_writable_path(possible_paths)

        self.logger.info(f"Using database path: {self.db_path}")
        # Creates tables / verifies connectivity; may raise on failure.
        self.initialize_database()
240
 
241
+ def _find_writable_path(self, paths):
242
+ """
243
+ Find the first writable path from a list of potential paths
244
+
245
+ Args:
246
+ paths (list): List of potential database paths
247
+
248
+ Returns:
249
+ str: First writable path found
250
+ """
251
+ for path in paths:
252
+ try:
253
+ # Create directory if it doesn't exist
254
+ directory = os.path.dirname(path)
255
+ if directory and not os.path.exists(directory):
256
+ os.makedirs(directory, exist_ok=True)
257
+
258
+ # Test if we can write to this location
259
+ test_file = path + ".test"
260
+ with open(test_file, 'w') as f:
261
+ f.write("test")
262
+ os.remove(test_file)
263
+
264
+ self.logger.info(f"Found writable path: {path}")
265
+ return path
266
+
267
+ except (OSError, PermissionError) as e:
268
+ self.logger.warning(f"Cannot write to {path}: {e}")
269
+ continue
270
+
271
+ # If no writable path found, default to current directory
272
+ default_path = "./iran_legal.db"
273
+ self.logger.warning(f"No writable path found, using default: {default_path}")
274
+ return default_path
275
+
276
    def initialize_database(self):
        """Initialize the database: create its directory, verify
        connectivity, enable WAL + foreign keys, and create tables.

        Raises:
            Exception: Unexpected initialization failures are re-raised;
                sqlite3.OperationalError and PermissionError are routed to
                dedicated recovery handlers first.
        """
        try:
            # Ensure the directory exists
            directory = os.path.dirname(self.db_path)
            if directory and not os.path.exists(directory):
                os.makedirs(directory, exist_ok=True)
                self.logger.info(f"Created directory: {directory}")

            # Test database connection
            with sqlite3.connect(self.db_path, timeout=10.0) as conn:
                # Enable WAL mode for better concurrency
                conn.execute("PRAGMA journal_mode=WAL;")
                conn.execute("PRAGMA foreign_keys = ON")

                # Test basic functionality
                cursor = conn.cursor()
                cursor.execute("SELECT sqlite_version();")
                version = cursor.fetchone()[0]
                self.logger.info(f"SQLite version: {version}")

                # Create tables
                self._create_tables(conn)

            self.logger.info("Database initialized successfully")

        except sqlite3.OperationalError as e:
            # Locked / I/O / corruption cases — _handle_database_error may
            # retarget self.db_path before re-raising.
            self.logger.error(f"SQLite operational error: {e}")
            self._handle_database_error(e)

        except PermissionError as e:
            # Retries initialization against a /tmp fallback path.
            self.logger.error(f"Permission error accessing database: {e}")
            self._handle_permission_error()

        except Exception as e:
            self.logger.error(f"Unexpected error initializing database: {e}")
            raise
313
+
314
    def _create_tables(self, conn):
        """Create (if absent) the ``documents`` and ``scraped_items``
        tables plus their lookup indexes, then commit.

        Args:
            conn: Open sqlite3 connection to operate on.

        Raises:
            sqlite3.Error: Propagated after logging if DDL fails.
        """
        try:
            cursor = conn.cursor()

            # Documents table — uploaded/ingested legal documents.
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    title TEXT NOT NULL,
                    content TEXT NOT NULL,
                    source TEXT,
                    category TEXT,
                    ai_score REAL DEFAULT 0.0,
                    keywords TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    file_size INTEGER DEFAULT 0,
                    language TEXT DEFAULT 'fa'
                )
            """)

            # Scraped items table — web content keyed by a caller-supplied
            # TEXT id (not autoincrement, unlike documents).
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS scraped_items (
                    id TEXT PRIMARY KEY,
                    url TEXT NOT NULL,
                    title TEXT,
                    content TEXT,
                    domain TEXT,
                    rating_score REAL DEFAULT 0.0,
                    word_count INTEGER DEFAULT 0,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    status TEXT DEFAULT 'completed'
                )
            """)

            # Add indexes for performance on the common filter columns.
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_documents_category
                ON documents(category);
            """)

            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_documents_ai_score
                ON documents(ai_score);
            """)

            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_scraped_domain
                ON scraped_items(domain);
            """)

            conn.commit()
            self.logger.info("Database tables created/verified")

        except sqlite3.Error as e:
            self.logger.error(f"Error creating tables: {e}")
            raise
372
+
373
+ def _handle_database_error(self, error):
374
+ """Handle SQLite operational errors"""
375
+ error_msg = str(error).lower()
376
+
377
+ if "database is locked" in error_msg:
378
+ self.logger.error("Database is locked. Attempting recovery...")
379
+ # Attempt to recover by trying a different path
380
+ self.db_path = f"/tmp/iran_legal_{os.getpid()}.db"
381
+ self.logger.info(f"Attempting recovery with new path: {self.db_path}")
382
+
383
+ elif "disk i/o error" in error_msg:
384
+ self.logger.error("Disk I/O error. Check disk space and permissions.")
385
+
386
+ elif "database disk image is malformed" in error_msg:
387
+ self.logger.error("Database file is corrupted. Attempting backup and recreation...")
388
+
389
+ else:
390
+ self.logger.error(f"Unknown database error: {error}")
391
+
392
+ # Re-raise the error after logging
393
+ raise error
394
+
395
    def _handle_permission_error(self):
        """Handle permission errors by retrying against a /tmp fallback.

        Rebinds ``self.db_path`` to a per-process temp file and retries
        table creation there.

        Raises:
            Exception: If the fallback path also fails to initialize.
        """
        self.logger.error("Permission denied accessing database path")

        # Try fallback to temp directory (per-process file name avoids
        # clashes between concurrent instances).
        fallback_path = f"/tmp/iran_legal_{os.getpid()}.db"
        self.logger.info(f"Attempting fallback to: {fallback_path}")
        self.db_path = fallback_path

        # Retry initialization with fallback path
        try:
            with sqlite3.connect(self.db_path) as conn:
                self._create_tables(conn)
            self.logger.info("Successfully initialized with fallback path")
        except Exception as e:
            self.logger.error(f"Fallback also failed: {e}")
            raise
412
 
413
    @contextmanager
    def get_connection(self):
        """Yield a sqlite3 connection with Row factory; always closed.

        Yields:
            sqlite3.Connection: Connection to ``self.db_path`` whose rows
                behave like dicts (``sqlite3.Row``).

        Raises:
            Exception: Connection/usage errors are logged and re-raised.
        """
        # conn starts as None so the finally block is safe even when
        # sqlite3.connect itself raises.
        conn = None
        try:
            conn = sqlite3.connect(self.db_path, timeout=10.0)
            conn.row_factory = sqlite3.Row
            yield conn
        except Exception as e:
            self.logger.error(f"Database connection error: {e}")
            raise
        finally:
            if conn:
                conn.close()
+ conn.close()
427
+
428
+ def health_check(self):
429
+ """Perform a health check on the database"""
430
+ try:
431
+ with self.get_connection() as conn:
432
+ cursor = conn.cursor()
433
+
434
+ # Basic connectivity test
435
+ cursor.execute("SELECT 1;")
436
+
437
+ # Check database integrity
438
+ cursor.execute("PRAGMA integrity_check;")
439
+ integrity = cursor.fetchone()[0]
440
+
441
+ # Get database info
442
+ cursor.execute("PRAGMA page_count;")
443
+ page_count = cursor.fetchone()[0]
444
+
445
+ cursor.execute("PRAGMA page_size;")
446
+ page_size = cursor.fetchone()[0]
447
+
448
+ size_mb = (page_count * page_size) / (1024 * 1024)
449
+
450
+ return {
451
+ "status": "healthy",
452
+ "path": self.db_path,
453
+ "integrity": integrity,
454
+ "size_mb": round(size_mb, 2),
455
+ "writable": os.access(os.path.dirname(self.db_path) or ".", os.W_OK)
456
+ }
457
+
458
+ except Exception as e:
459
+ return {
460
+ "status": "unhealthy",
461
+ "error": str(e),
462
+ "path": self.db_path
463
+ }
464
 
465
  def add_document(self, doc_data: Dict) -> int:
466
  """Add a new document to the database"""
 
482
  conn.commit()
483
  return doc_id
484
  except Exception as e:
485
+ self.logger.error(f"Error adding document: {e}")
486
  return 0
487
 
488
  def add_scraped_item(self, item_data: Dict) -> bool:
 
506
  conn.commit()
507
  return True
508
  except Exception as e:
509
+ self.logger.error(f"Error adding scraped item: {e}")
510
  return False
511
 
512
  def get_documents(self, limit: int = 100) -> List[Dict]:
 
520
  """, (limit,))
521
  return [dict(row) for row in cursor.fetchall()]
522
  except Exception as e:
523
+ self.logger.error(f"Error getting documents: {e}")
524
  return []
525
 
526
  def get_scraped_items(self, limit: int = 100) -> List[Dict]:
 
534
  """, (limit,))
535
  return [dict(row) for row in cursor.fetchall()]
536
  except Exception as e:
537
+ self.logger.error(f"Error getting scraped items: {e}")
538
  return []
539
 
540
  def search_content(self, query: str, limit: int = 50) -> List[Dict]:
 
563
  results.extend([dict(row) for row in cursor.fetchall()])
564
 
565
  except Exception as e:
566
+ self.logger.error(f"Error searching content: {e}")
567
 
568
  return sorted(results, key=lambda x: x.get('score', 0), reverse=True)[:limit]
569
 
 
607
  stats['categories'] = dict(cursor.fetchall())
608
 
609
  except Exception as e:
610
+ self.logger.error(f"Error getting statistics: {e}")
611
 
612
  return stats
613
 
614
+ # AI Analysis Engine (same as before)
615
  class AIAnalysisEngine:
616
  def __init__(self):
617
  self.legal_keywords = {
 
729
  else:
730
  return 'unknown'
731
 
732
+ # Web Scraping Service (same as before, keeping it brief for space)
733
  class WebScrapingService:
734
  def __init__(self):
735
  self.session = requests.Session()
 
779
  """Extract main content from soup object"""
780
  # Try different content selectors
781
  content_selectors = [
782
+ 'article', '.content', '.main-content', '#content',
783
+ '.post-content', '.entry-content', 'main', '.container'
 
 
 
 
 
 
784
  ]
785
 
786
  content = ""
 
798
 
799
  return content
800
 
801
+ # Rating Service (same as before)
802
  class RatingService:
803
  def __init__(self):
804
  self.trusted_domains = {
 
855
 
856
  return min(score, 1.0)
857
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
858
  # UI Helper Functions
859
  def show_status_message(message: str, status_type: str = "info"):
860
  """Show styled status message"""
 
871
  </div>
872
  """
873
 
874
+ # Enhanced Initialize services with health check
875
@st.cache_resource
def initialize_services():
    """Initialize all services with a database health check.

    Cached by Streamlit (``st.cache_resource``) so the services are
    constructed once per server process, not once per rerun.

    Returns:
        tuple: (db_manager, ai_engine, scraping_service, rating_service),
            or (None, None, None, None) when initialization fails — the
            caller must handle the all-None case.
    """
    try:
        # Initialize database manager with fallback paths
        db_manager = DatabaseManager()

        # Perform health check and surface the result in the UI.
        health = db_manager.health_check()
        if health["status"] == "unhealthy":
            st.error(f"❌ Database health check failed: {health['error']}")
            st.warning("⚠️ The application will continue with limited functionality.")
        else:
            st.success(f"✅ Database initialized successfully (Size: {health['size_mb']} MB)")

        # Initialize other services
        ai_engine = AIAnalysisEngine()
        scraping_service = WebScrapingService()
        rating_service = RatingService()

        return db_manager, ai_engine, scraping_service, rating_service

    except Exception as e:
        st.error(f"❌ Failed to initialize services: {e}")
        st.info("Please check the logs for more details.")

        # Return None objects to prevent further errors
        return None, None, None, None
903
+
904
def debug_database_environment():
    """Render a diagnostics panel for the database environment.

    Shows process/user information, checks existence and writability of
    the candidate database directories, and reports the SQLite version.
    Intended for the debug page, or when database initialization fails.
    """
    st.markdown("### 🔧 Database Environment Debug")

    debug_info = {
        "Current working directory": os.getcwd(),
        "Python executable": sys.executable,
        "Operating system": os.name,
        "User": os.getenv('USER', os.getenv('USERNAME', 'unknown')),
    }

    for key, value in debug_info.items():
        st.write(f"**{key}:** {value}")

    # Check common paths
    st.markdown("#### 📁 Path Accessibility Check")
    paths_to_check = [
        "./",
        "./data/",
        "/tmp/",
        os.path.expanduser("~/")
    ]

    for path in paths_to_check:
        try:
            # Stat the path exactly once; the original code called
            # os.path.exists twice, so 'exists' and 'writable' could
            # disagree if the path changed between the two stats.
            exists = os.path.exists(path)
            writable = os.access(path, os.W_OK) if exists else False

            if exists and writable:
                st.success(f"✅ {path} - Exists: {exists}, Writable: {writable}")
            elif exists:
                st.warning(f"⚠️ {path} - Exists: {exists}, Writable: {writable}")
            else:
                st.error(f"❌ {path} - Exists: {exists}, Writable: {writable}")

        except Exception as e:
            st.error(f"❌ {path} - Error checking: {e}")

    # Check SQLite availability and version (local import kept so the
    # ImportError branch stays meaningful if the top-level import moves).
    try:
        import sqlite3
        st.success(f"✅ SQLite version: {sqlite3.sqlite_version}")
    except ImportError:
        st.error("❌ SQLite not available")
948
 
949
  # Main Application
950
  def main():
 
961
  # Initialize services
962
  db_manager, ai_engine, scraping_service, rating_service = initialize_services()
963
 
964
+ # Show debug info if database failed
965
+ if db_manager is None:
966
+ st.warning("⚠️ Database initialization failed. Showing debug information:")
967
+ debug_database_environment()
968
+ return
969
+
970
  # Sidebar navigation
971
  st.sidebar.markdown("### 📋 منوی اصلی")
972
 
 
975
  "🌐 اسکرپینگ وب": "scraping",
976
  "📄 مدیریت اسناد": "documents",
977
  "🔍 جستجو و تحلیل": "search",
978
+ "📊 گزارشات و آمار": "reports",
979
+ "🔧 تنظیمات و دیباگ": "debug"
980
  }
981
 
982
  selected_page = st.sidebar.selectbox("انتخاب صفحه:", list(pages.keys()))
 
985
  # Route to appropriate page
986
  if page_key == "dashboard":
987
  show_dashboard(db_manager)
988
+ elif page_key == "debug":
989
+ debug_database_environment()
990
+ # Add other page handlers here...
 
 
 
 
 
991
 
992
  def show_dashboard(db_manager: DatabaseManager):
993
  """Display main dashboard"""
 
994
  # Get statistics
995
  stats = db_manager.get_statistics()
996
 
 
1025
  "میانگین کیفیت"
1026
  ), unsafe_allow_html=True)
1027
 
1028
+ # Database health status
1029
  st.markdown("---")
1030
+ health = db_manager.health_check()
1031
 
1032
+ if health["status"] == "healthy":
1033
+ show_status_message(f"✅ Database Status: Healthy (Path: {health['path']})", "success")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1034
  else:
1035
+ show_status_message(f" Database Status: Unhealthy - {health['error']}", "error")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1036
 
1037
  # Run the application
1038
  if __name__ == "__main__":