google-labs-jules[bot] committed on
Commit
f4c4bbd
·
1 Parent(s): 5caa7cf

Fix SyntaxError in app.py and 2app.py

Browse files
Files changed (8) hide show
  1. .gitattributes +35 -0
  2. 2app.py +449 -0
  3. Dockerfile +25 -0
  4. README.md +12 -0
  5. app.py +960 -0
  6. huggingface.yml +3 -0
  7. nltk_setup.py +53 -0
  8. requirements.txt +17 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
2app.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import platform
2
+ import os
3
+ import sqlite3
4
+ import uuid
5
+ import datetime
6
+ import shutil
7
+ import traceback
8
+ import logging
9
+ from pathlib import Path
10
+ from abc import ABC, abstractmethod
11
+ from typing import Dict, Any, List
12
+ import gradio as gr
13
+ import pandas as pd
14
+
15
+ # --- Base Classes ---
16
class Interface(ABC):
    """Abstract base class for objects that expose a ``launch`` entry point."""

    @abstractmethod
    def launch(self):
        """Start the interface; every concrete subclass must override this."""
20
+
21
class Command(ABC):
    """Abstract base class for a unit of work run through ``execute``."""

    @abstractmethod
    def execute(self):
        """Perform the command; every concrete subclass must override this."""
25
+
26
+ # --- Database Manager Implementation ---
27
class DatabaseManager:
    """Handles all database operations including creation, connection, and CRUD operations."""

    def __init__(self, db_path: str = None):
        """Open the SQLite database and make sure the schema exists.

        Args:
            db_path: Path of the database file. When ``None`` a per-user,
                platform-specific directory is created and used.

        Raises:
            sqlite3.Error: If connecting or creating the tables fails.
        """
        if db_path is None:
            home = os.path.expanduser('~')
            system = platform.system()
            if system == 'Windows':
                # APPDATA may be unset (e.g. service accounts); fall back to
                # the home directory instead of raising KeyError.
                base_dir = os.path.join(os.environ.get('APPDATA', home), 'FileStorageApp')
            elif system == 'Darwin':
                base_dir = os.path.join(home, 'Library', 'Application Support', 'FileStorageApp')
            else:
                base_dir = os.path.join(home, '.filestorage')

            os.makedirs(base_dir, exist_ok=True)
            self.db_path = os.path.join(base_dir, 'file_storage.db')
        else:
            self.db_path = db_path

        self.conn = None
        self.cursor = None
        self.connect()
        self.create_tables()

    def connect(self) -> None:
        """Establish a connection to the SQLite database."""
        try:
            # UI callbacks may run on a different thread than the one that
            # created this object; the default check_same_thread=True would
            # make every such call fail with sqlite3.ProgrammingError.
            self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
            self.conn.execute("PRAGMA foreign_keys = ON")
            self.cursor = self.conn.cursor()
        except sqlite3.Error as e:
            logging.error(f"Database connection error: {e}")
            raise

    def create_tables(self) -> None:
        """Create necessary tables if they don't exist."""
        tables = [
            '''CREATE TABLE IF NOT EXISTS files (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT NOT NULL,
                original_filename TEXT NOT NULL,
                file_path TEXT NOT NULL,
                file_size INTEGER NOT NULL,
                file_type TEXT,
                upload_date DATETIME DEFAULT CURRENT_TIMESTAMP
            )''',
            '''CREATE TABLE IF NOT EXISTS metadata (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_id INTEGER NOT NULL,
                key TEXT NOT NULL,
                value TEXT,
                FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE
            )''',
            '''CREATE TABLE IF NOT EXISTS chunks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                file_id INTEGER NOT NULL,
                chunk_index INTEGER NOT NULL,
                chunk_text TEXT NOT NULL,
                chunk_size INTEGER NOT NULL,
                FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE
            )''',
            # Errors are not tied to a file row, so they get their own table
            # rather than a metadata row with a dangling file_id.
            '''CREATE TABLE IF NOT EXISTS error_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                logged_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                details TEXT NOT NULL
            )''',
        ]

        try:
            for table in tables:
                self.cursor.execute(table)
            self.conn.commit()
        except sqlite3.Error as e:
            self.conn.rollback()
            logging.error(f"Error creating tables: {e}")
            raise

    def insert_file(self, file_data: Dict[str, Any]) -> int:
        """Insert file information into the database.

        Args:
            file_data: Mapping with the keys ``filename``,
                ``original_filename``, ``file_path``, ``file_size`` and
                ``file_type``.

        Returns:
            The row id of the newly inserted file.

        Raises:
            sqlite3.Error: If the insert fails (the transaction is rolled back).
        """
        try:
            self.cursor.execute('''
                INSERT INTO files (filename, original_filename, file_path, file_size, file_type)
                VALUES (?, ?, ?, ?, ?)
            ''', (file_data['filename'], file_data['original_filename'],
                  file_data['file_path'], file_data['file_size'], file_data['file_type']))
            self.conn.commit()
            return self.cursor.lastrowid
        except sqlite3.Error as e:
            self.conn.rollback()
            logging.error(f"Error inserting file: {e}")
            raise

    def insert_metadata(self, file_id: int, metadata: Dict[str, str]) -> None:
        """Insert metadata for a specific file.

        All key/value pairs are written in one transaction, so either every
        pair is stored or none is.

        Raises:
            sqlite3.Error: If any insert fails (the transaction is rolled back).
        """
        try:
            self.cursor.executemany('''
                INSERT INTO metadata (file_id, key, value)
                VALUES (?, ?, ?)
            ''', [(file_id, key, value) for key, value in metadata.items()])
            self.conn.commit()
        except sqlite3.Error as e:
            self.conn.rollback()
            logging.error(f"Error inserting metadata: {e}")
            raise

    def insert_chunk(self, file_id: int, chunk_index: int, chunk_text: str) -> None:
        """Insert a text chunk into the database.

        The stored ``chunk_size`` is the number of whitespace-separated words
        in ``chunk_text``.

        Raises:
            sqlite3.Error: If the insert fails (the transaction is rolled back).
        """
        try:
            chunk_size = len(chunk_text.split())
            self.cursor.execute('''
                INSERT INTO chunks (file_id, chunk_index, chunk_text, chunk_size)
                VALUES (?, ?, ?, ?)
            ''', (file_id, chunk_index, chunk_text, chunk_size))
            self.conn.commit()
        except sqlite3.Error as e:
            self.conn.rollback()
            logging.error(f"Error inserting chunk: {e}")
            raise

    def log_error(self, error_data: Dict[str, str]) -> None:
        """Log errors to the database.

        Best effort: a failure to record the error is logged through
        ``logging`` and never raised to the caller.
        """
        try:
            self.cursor.execute(
                'INSERT INTO error_log (details) VALUES (?)',
                (str(error_data),),
            )
            self.conn.commit()
        except sqlite3.Error as e:
            # Do not leave a half-open transaction behind.
            self.conn.rollback()
            logging.error(f"Error logging error: {e}")

    def close(self) -> None:
        """Close the database connection."""
        if self.conn:
            self.conn.close()
154
+
155
+ # --- File Processor Implementation ---
156
class FileProcessor:
    """Handles file uploads, storage, and metadata extraction."""

    def __init__(self, upload_folder: str = None):
        """Create the upload folder if needed.

        Args:
            upload_folder: Directory that receives stored uploads. Defaults
                to ``FileUploads`` inside the user's home directory.
        """
        self.upload_folder = upload_folder or os.path.join(Path.home(), 'FileUploads')
        os.makedirs(self.upload_folder, exist_ok=True)

    def save_file(self, file: Any) -> Dict[str, Any]:
        """Save the uploaded file and extract metadata.

        Args:
            file: Upload object exposing ``name``. If it also has ``read()``
                its content is written out; otherwise ``name`` is treated as
                the path of a file on disk and that file is copied.

        Returns:
            A dict with ``filename``, ``original_filename``, ``file_path``,
            ``file_size`` and ``file_type``.

        Raises:
            Exception: Any error raised while writing or copying is logged
                and re-raised.
        """
        source_name = str(file.name)
        # ``name`` can be a full path; keep only the last component so the
        # stored file always lands directly inside the upload folder.
        original_name = os.path.basename(source_name)
        filename = f"{uuid.uuid4()}_{original_name}"
        file_path = os.path.join(self.upload_folder, filename)

        try:
            if hasattr(file, 'read'):
                payload = file.read()
                if isinstance(payload, str):
                    payload = payload.encode('utf-8')
                with open(file_path, "wb") as f:
                    f.write(payload)
            else:
                # Path-like upload value without a file API: copy from disk.
                shutil.copyfile(source_name, file_path)
            return {
                'filename': filename,
                'original_filename': original_name,
                'file_path': file_path,
                'file_size': os.path.getsize(file_path),
                'file_type': original_name.split('.')[-1] if '.' in original_name else 'unknown'
            }
        except Exception as e:
            logging.error(f"Error saving file: {e}")
            raise

    def extract_content(self, file_path: str) -> str:
        """Extract text content from a file.

        The file is read as UTF-8 text; read or decode errors are logged and
        re-raised.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            logging.error(f"Error extracting content: {e}")
            raise
189
+
190
+ # --- Text Chunker Implementation ---
191
class TextChunker:
    """Splits text content into manageable chunks."""

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """Configure the chunker.

        Args:
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not in ``[0, chunk_size)``. Such values would stop the
                chunking window from ever advancing.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must be >= 0 and smaller than chunk_size")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> List[str]:
        """Split text into chunks with overlap.

        Words are whitespace-separated; each chunk holds up to ``chunk_size``
        words and starts ``chunk_size - overlap`` words after the previous
        one. Returns an empty list for empty or whitespace-only text.
        """
        words = text.split()
        step = self.chunk_size - self.overlap
        chunks = []

        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(' '.join(words[start:end]))
            # Once a chunk reaches the last word, any further window would
            # contain only words already emitted as overlap.
            if end >= len(words):
                break

        return chunks
209
+
210
+ # --- Command Handler Implementation ---
211
class CommandHandler:
    """Registry that maps command names to command objects and runs them."""

    def __init__(self):
        # name -> registered command object
        self.commands = {}

    def register_command(self, name: str, command: "Command"):
        """Store ``command`` under ``name``, replacing any earlier entry."""
        self.commands[name] = command

    def execute_command(self, name: str) -> bool:
        """Run the command registered as ``name``.

        Returns ``True`` after the command's ``execute()`` has been called,
        or ``False`` (with a warning logged) when no such command exists.
        """
        try:
            target = self.commands[name]
        except KeyError:
            logging.warning(f"Command '{name}' not found.")
            return False
        target.execute()
        return True
225
+
226
+ # --- Main Application Implementation ---
227
class Application(Interface):
    """Core application class."""

    def __init__(self):
        """Wire up the database, file processor, chunker and command handler."""
        self.db_manager = DatabaseManager()
        self.file_processor = FileProcessor()
        self.text_chunker = TextChunker(chunk_size=512, overlap=50)
        self.command_handler = CommandHandler()
        # Summary of the most recent run(); None until run() is first called.
        self.processed_data = None

    def launch(self):
        """Build the Gradio UI for this application and start serving it.

        Implements the abstract ``Interface.launch``. Without this override
        the class stays abstract and ``Application()`` raises ``TypeError``.
        """
        return DataDeityInterface(self).create_interface().launch()

    def run(self, uploaded_file: Any) -> None:
        """Main processing pipeline.

        Saves the upload, records it in the database, extracts and chunks
        its text, stores metadata and chunks, then sets ``processed_data``
        to a summary dict.

        Raises:
            ValueError: If ``uploaded_file`` is falsy.
            Exception: Any failure in the pipeline is recorded through
                ``_handle_error`` and re-raised.
        """
        try:
            if not uploaded_file:
                raise ValueError("No file provided for processing")

            # Process file
            file_info = self.file_processor.save_file(uploaded_file)
            file_id = self.db_manager.insert_file(file_info)

            # Extract and chunk content
            raw_content = self.file_processor.extract_content(file_info['file_path'])
            chunks = self.text_chunker.chunk_text(raw_content)

            # Store chunks and metadata
            self.db_manager.insert_metadata(file_id, {
                'source': 'upload',
                'processed_at': datetime.datetime.now().isoformat()
            })

            # Chunk indices stored in the database are 1-based.
            for idx, chunk in enumerate(chunks, start=1):
                self.db_manager.insert_chunk(file_id, idx, chunk)

            self.processed_data = {
                'filename': uploaded_file.name,
                'chunk_count': len(chunks),
                'status': 'processed'
            }

        except Exception as e:
            self._handle_error(e)
            raise

    def _handle_error(self, error: Exception) -> None:
        """Centralized error handling.

        Records the error (type, message, stack trace, timestamp) through
        the database manager and marks ``processed_data`` as failed.
        """
        error_data = {
            'timestamp': datetime.datetime.now().isoformat(),
            'error_type': type(error).__name__,
            'message': str(error),
            'stack_trace': traceback.format_exc()
        }
        self.db_manager.log_error(error_data)
        self.processed_data = {'status': 'failed'}
279
+
280
+ # --- Gradio Interface Implementation ---
281
class DataDeityInterface:
    """Gradio front end for an application object.

    The wrapped ``app`` is expected to provide ``run(file)``,
    ``processed_data`` and ``db_manager.cursor``, which the callbacks below
    use.
    """

    def __init__(self, app):
        self.app = app
        self._setup_theme()

    def _setup_theme(self):
        """Create the Gradio theme used by ``create_interface``."""
        self.theme = gr.themes.Default(
            primary_hue="emerald",
            secondary_hue="teal",
            font=[gr.themes.GoogleFont("Fira Code"), "Arial", "sans-serif"]
        )

    def _file_upload_tab(self):
        """Build the upload tab and return its file input component."""
        with gr.Tab("📤 Upload & Process"):
            with gr.Row():
                file_input = gr.File(label="Drag files here", file_count="multiple")
                stats_output = gr.JSON(label="Processing Stats")

            with gr.Row():
                process_btn = gr.Button("⚡ Process Files", variant="primary")
                clear_btn = gr.Button("🧹 Clear Cache")

            file_output = gr.Dataframe(label="File Contents Preview")

            process_btn.click(
                self.process_file,
                inputs=file_input,
                outputs=[stats_output, file_output]
            )
            # One value per output component so all three are reset.
            clear_btn.click(
                lambda: (None, None, None),
                outputs=[file_input, stats_output, file_output]
            )

        return file_input

    def _data_explorer_tab(self):
        """Build the tab that shows the raw database tables."""
        with gr.Tab("🔍 Data Explorer"):
            with gr.Row():
                refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
                search_bar = gr.Textbox(placeholder="Search across all data...")

            with gr.Tabs():
                with gr.Tab("Database View"):
                    files_table = gr.Dataframe(label="Stored Files")
                    metadata_table = gr.Dataframe(label="File Metadata")
                    chunks_table = gr.Dataframe(label="Text Chunks")

                with gr.Tab("Analytics View"):
                    stats_plot = gr.Plot(label="Data Distribution")
                    correlations = gr.Matrix(label="Data Correlations")

            refresh_btn.click(
                self.refresh_data,
                outputs=[files_table, metadata_table, chunks_table]
            )

    def _command_interface_tab(self):
        """Build the free-text command console tab."""
        with gr.Tab("💻 Command Console"):
            cmd_input = gr.Textbox(
                placeholder="Enter data command...",
                lines=3,
                max_lines=10
            )

            with gr.Row():
                execute_btn = gr.Button("🚀 Execute", variant="primary")
                cmd_history_btn = gr.Button("🕒 History")

            cmd_output = gr.JSON(label="Command Results")
            cmd_explain = gr.Markdown("### Command Explanation")

            execute_btn.click(
                self.execute_command,
                inputs=cmd_input,
                outputs=[cmd_output, cmd_explain]
            )

    def create_interface(self):
        """Assemble all tabs into a ``gr.Blocks`` app and return it."""
        with gr.Blocks(theme=self.theme, title="Data Deity") as interface:
            gr.Markdown("# 🧠 Data Deity - Ultimate Data Omnipotence Interface")

            with gr.Tabs():
                file_input = self._file_upload_tab()
                self._data_explorer_tab()
                self._command_interface_tab()

        return interface

    def process_file(self, files):
        """Run every uploaded file through the application.

        Args:
            files: Iterable of upload objects exposing ``name`` (treated as
                a path on disk when sizes are measured); may be ``None``.

        Returns:
            ``(stats, preview)``: a stats dict and a DataFrame with one row
            per file. On failure ``stats`` is ``{"error": ...}`` and the
            DataFrame is empty.
        """
        if not files:
            return {"error": "No files provided for processing"}, pd.DataFrame()

        try:
            processed_files = []
            total_bytes = 0
            for file in files:
                self.app.run(file)
                # Upload objects are not guaranteed to carry a size, so it
                # is measured from the file on disk.
                total_bytes += os.path.getsize(file.name)
                processed_files.append({
                    "filename": file.name,
                    "chunks": self.app.processed_data['chunk_count'],
                    "status": "processed",
                    "timestamp": datetime.datetime.now().isoformat()
                })

            average_mb = total_bytes / len(processed_files) / 1024 / 1024
            stats = {
                "total_files": len(processed_files),
                "total_chunks": sum(f['chunks'] for f in processed_files),
                "average_size": f"{average_mb:.2f}MB"
            }

            preview = pd.DataFrame({
                "File": [f.name for f in files],
                "Type": [f.name.split('.')[-1] for f in files],
                "Status": ["✅ Processed"]*len(files)
            })

            return stats, preview

        except Exception as e:
            return {"error": str(e)}, pd.DataFrame()

    def refresh_data(self):
        """Load the files, metadata and chunks tables as DataFrames.

        Returns three empty DataFrames (and logs the exception) if any query
        fails.
        """
        try:
            cursor = self.app.db_manager.cursor
            files = cursor.execute("SELECT * FROM files").fetchall()
            metadata = cursor.execute("SELECT * FROM metadata").fetchall()
            chunks = cursor.execute("SELECT * FROM chunks").fetchall()

            files_df = pd.DataFrame(files, columns=["ID", "Filename", "Original", "Path", "Size", "Type", "Uploaded"])
            metadata_df = pd.DataFrame(metadata, columns=["ID", "File ID", "Key", "Value"])
            chunks_df = pd.DataFrame(chunks, columns=["ID", "File ID", "Index", "Text", "Size"])

            return files_df, metadata_df, chunks_df

        except Exception:
            logging.exception("Failed to refresh data tables")
            return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def execute_command(self, command):
        """Interpret a console command.

        Supports ``list files`` and ``search <term>``, both matched
        case-insensitively.

        Returns:
            ``(result, explanation)``: a JSON-serialisable dict and a
            Markdown string.
        """
        try:
            text = command or ""
            lowered = text.lower()

            if "list files" in lowered:
                files = self.app.db_manager.cursor.execute("SELECT filename, file_type, upload_date FROM files").fetchall()
                return {"result": files}, "### File Listing Command\nRetrieved all stored files from database."

            elif "search" in lowered:
                # Locate the keyword in the lowered text but slice the
                # original, so the term keeps its case and everything after
                # the first keyword is used.
                keyword_end = lowered.index("search") + len("search")
                term = text[keyword_end:].strip()
                results = self.app.db_manager.cursor.execute(
                    "SELECT chunk_text FROM chunks WHERE chunk_text LIKE ?",
                    (f"%{term}%",)
                ).fetchall()
                return {"matches": [r[0] for r in results]}, f"### Search Results\nFound {len(results)} matches for '{term}'"

            else:
                return {"error": "Command not recognized"}, "### Unrecognized Command\nTry 'list files' or 'search <term>'"

        except Exception as e:
            return {"error": str(e)}, "### Command Execution Failed"
430
+
431
+ # --- Main Execution ---
432
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Bound before the try so the cleanup below is safe even when building
    # the Application itself raises.
    app = None
    try:
        app = Application()
        interface = DataDeityInterface(app)
        interface.create_interface().launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=True
        )
    except KeyboardInterrupt:
        logging.info("\nApplication shutdown requested")
    finally:
        if app is not None:
            app.db_manager.close()
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use the official Python image from the Docker Hub
2
+ FROM python:3.9-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR ./app
6
+
7
+ # Copy the requirements.txt file into the container
8
+ COPY requirements.txt .
9
+
10
+ RUN mkdir -p /home/user/nltk_data && chmod a+rwx /home/user/nltk_data
11
+
12
+ # Install the required packages
13
+ RUN pip install --no-cache-dir -r requirements.txt
14
+
15
+ # Install additional packages if needed
16
+ RUN pip install matplotlib
17
+
18
+ # Copy the rest of your application code into the container
19
+ COPY . .
20
+
21
+ # Download NLTK resources
22
+ RUN python -m nltk.downloader punkt vader_lexicon stopwords
23
+
24
+ # Command to run your application
25
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Dbgod
3
+ emoji: 🌍
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.32.0
8
+ app_file: app.py
9
+ pinned: true
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import json
5
+ import sqlite3
6
+ import tempfile
7
+ import nltk
8
+ import traceback
9
+ import datetime
10
+ import time
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ import io
14
+ import base64
15
+ import requests
16
+ import re
17
+ from pathlib import Path
18
+ from nltk.sentiment import SentimentIntensityAnalyzer
19
+ from nltk.tokenize import word_tokenize
20
+ from nltk.corpus import stopwords
21
+ from sklearn.model_selection import train_test_split
22
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
23
+ from sklearn.linear_model import LinearRegression
24
+ from sklearn.cluster import KMeans
25
+ from sklearn.preprocessing import StandardScaler
26
+ from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
27
+ from sklearn.feature_extraction.text import TfidfVectorizer
28
+ import pymongo
29
+ import redis
30
+ import pymysql # Using pymysql instead of mysql.connector
31
+ import psycopg2
32
+ from bs4 import BeautifulSoup
33
+
34
def setup_nltk():
    """Make the NLTK resources used by this module available.

    Registers a ``nltk_data`` folder inside the system temp directory as an
    NLTK data path and downloads each required resource that
    ``nltk.data.find`` cannot locate.

    Returns:
        ``True`` on success, ``False`` if any exception occurred (the error
        is printed).
    """
    # resource name -> NLTK data category it is looked up under
    resource_categories = {
        'punkt': 'tokenizers',
        'stopwords': 'corpora',
        'vader_lexicon': 'sentiment',
    }
    try:
        # Use a temporary directory for NLTK data
        data_dir = os.path.join(tempfile.gettempdir(), 'nltk_data')
        os.makedirs(data_dir, exist_ok=True)
        nltk.data.path.append(data_dir)

        # Download necessary NLTK data
        for name, category in resource_categories.items():
            try:
                nltk.data.find(f'{category}/{name}')
            except LookupError:
                nltk.download(name, download_dir=data_dir, quiet=True)
    except Exception as e:
        print(f"Error setting up NLTK: {e}")
        return False
    return True


# Initialize NLTK
if not setup_nltk():
    print("Failed to set up NLTK. Some NLP features may not work properly.")
58
+
59
class DatabaseManager:
    """SQLite-backed store for files, metadata, chunks, insights and analytics."""

    # Schema statements executed, in order, by _create_tables().
    _SCHEMA = (
        '''
            CREATE TABLE IF NOT EXISTS files (
                id INTEGER PRIMARY KEY,
                filename TEXT,
                original TEXT,
                path TEXT,
                size INTEGER,
                file_type TEXT,
                upload_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS metadata (
                id INTEGER PRIMARY KEY,
                file_id INTEGER,
                meta_key TEXT,
                meta_value TEXT,
                FOREIGN KEY (file_id) REFERENCES files (id)
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS chunks (
                id INTEGER PRIMARY KEY,
                file_id INTEGER,
                chunk_index INTEGER,
                chunk_text TEXT,
                chunk_size INTEGER,
                FOREIGN KEY (file_id) REFERENCES files (id)
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS insights (
                id INTEGER PRIMARY KEY,
                file_id INTEGER,
                insight_type TEXT,
                insight_text TEXT,
                confidence REAL,
                timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                is_speculative BOOLEAN,
                FOREIGN KEY (file_id) REFERENCES files (id)
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS analytics (
                id INTEGER PRIMARY KEY,
                file_id INTEGER,
                analysis_type TEXT,
                analysis_result TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (file_id) REFERENCES files (id)
            )
        ''',
    )

    def __init__(self, db_path=None):
        """Open the database, falling back to an in-memory one on failure.

        Args:
            db_path: Path of the SQLite file. When ``None`` the file
                ``data/data_deity.db`` under the system temp directory is
                used.

        Raises:
            sqlite3.Error: If even the in-memory fallback cannot be set up.
        """
        try:
            # Use a temporary directory for the database
            if db_path is None:
                db_dir = os.path.join(tempfile.gettempdir(), 'data')
                os.makedirs(db_dir, exist_ok=True)
                db_path = os.path.join(db_dir, 'data_deity.db')

            self._open(db_path)
            print(f"Successfully initialized database at {db_path}")
        except sqlite3.Error as e:
            print(f"Failed to initialize database: {e}")
            # Release a connection that opened but failed during table
            # creation before replacing it.
            self.close()
            # Fallback to in-memory database if file-based DB fails
            try:
                print("Trying in-memory database as fallback...")
                self._open(":memory:")
                print("Successfully initialized in-memory database")
            except sqlite3.Error as e2:
                print(f"Failed to initialize in-memory database: {e2}")
                raise

    def _open(self, db_path):
        """Connect to ``db_path``, create a cursor and ensure the schema exists."""
        self.db_path = db_path
        # UI callbacks may run on a different thread than the one that
        # created this object; the default check_same_thread=True would make
        # every such call fail with sqlite3.ProgrammingError.
        self.connection = sqlite3.connect(db_path, check_same_thread=False)
        self.cursor = self.connection.cursor()
        self._create_tables()

    def _create_tables(self):
        """Create all tables that do not exist yet and commit."""
        try:
            for statement in self._SCHEMA:
                self.cursor.execute(statement)
            self.connection.commit()
            print("Successfully created database tables")
        except sqlite3.Error as e:
            print(f"Error creating tables: {e}")
            raise

    def _write(self, caller, sql, params):
        """Execute one INSERT and commit.

        Returns the new row id, or ``None`` after printing the error and
        rolling back.
        """
        try:
            self.cursor.execute(sql, params)
            self.connection.commit()
            return self.cursor.lastrowid
        except sqlite3.Error as e:
            print(f"Database Error in {caller}: {e}")
            self.connection.rollback()
            return None

    def _query(self, caller, sql, params, one=False):
        """Run a SELECT and return one row (``one=True``) or all rows.

        On error the message is printed and ``None`` (one row) or ``[]``
        (all rows) is returned.
        """
        try:
            self.cursor.execute(sql, params)
            return self.cursor.fetchone() if one else self.cursor.fetchall()
        except sqlite3.Error as e:
            print(f"Database Error in {caller}: {e}")
            return None if one else []

    def add_file(self, filename, original, path, size, file_type):
        """Insert a file row; return its id, or ``None`` on a database error."""
        return self._write('add_file', '''
            INSERT INTO files (filename, original, path, size, file_type)
            VALUES (?, ?, ?, ?, ?)
        ''', (filename, original, path, size, file_type))

    def add_metadata(self, file_id, meta_key, meta_value):
        """Insert one metadata key/value pair for ``file_id``."""
        self._write('add_metadata', '''
            INSERT INTO metadata (file_id, meta_key, meta_value)
            VALUES (?, ?, ?)
        ''', (file_id, meta_key, meta_value))

    def add_chunk(self, file_id, chunk_index, chunk_text, chunk_size):
        """Insert one content chunk for ``file_id``."""
        self._write('add_chunk', '''
            INSERT INTO chunks (file_id, chunk_index, chunk_text, chunk_size)
            VALUES (?, ?, ?, ?)
        ''', (file_id, chunk_index, chunk_text, chunk_size))

    def add_insight(self, file_id, insight_type, insight_text, confidence, is_speculative):
        """Insert one insight row for ``file_id``."""
        self._write('add_insight', '''
            INSERT INTO insights (file_id, insight_type, insight_text, confidence, is_speculative)
            VALUES (?, ?, ?, ?, ?)
        ''', (file_id, insight_type, insight_text, confidence, is_speculative))

    def add_analysis(self, file_id, analysis_type, analysis_result):
        """Insert one analysis result row for ``file_id``."""
        self._write('add_analysis', '''
            INSERT INTO analytics (file_id, analysis_type, analysis_result)
            VALUES (?, ?, ?)
        ''', (file_id, analysis_type, analysis_result))

    def get_file_by_id(self, file_id):
        """Return the ``files`` row for ``file_id``, or ``None`` if absent or on error."""
        return self._query('get_file_by_id', '''
            SELECT * FROM files WHERE id = ?
        ''', (file_id,), one=True)

    def get_analysis_by_file_id(self, file_id):
        """Return ``(analysis_type, analysis_result)`` rows for ``file_id``; ``[]`` on error."""
        return self._query('get_analysis_by_file_id', '''
            SELECT analysis_type, analysis_result
            FROM analytics
            WHERE file_id = ?
        ''', (file_id,))

    def get_insights_by_file_id(self, file_id):
        """Return ``(insight_type, insight_text, confidence)`` rows for ``file_id``; ``[]`` on error."""
        return self._query('get_insights_by_file_id', '''
            SELECT insight_type, insight_text, confidence
            FROM insights
            WHERE file_id = ?
        ''', (file_id,))

    def close(self):
        """Close the connection if one was opened."""
        if hasattr(self, 'connection') and self.connection:
            self.connection.close()
241
+
242
+ class FileProcessor:
243
def __init__(self, db_manager):
    """Keep the database manager and create the sentiment analyzer.

    Args:
        db_manager: Object used by the processing methods to record files,
            metadata, chunks, analyses and insights.
    """
    self.db_manager = db_manager
    # Built once here and reused for every text file that gets scored.
    self.sia = SentimentIntensityAnalyzer()
246
+
247
def process_file(self, file):
    """Copy an uploaded file to a temp directory, register it and process it.

    Args:
        file: Upload object whose ``name`` is the path of the file on disk.

    Returns:
        ``(file_id, chunk_count)`` for the processed file.

    Raises:
        ValueError: If the extension is not ``.txt``, ``.csv`` or ``.json``.
        Exception: If the file cannot be registered in the database, or any
            error raised while copying or processing (printed, then
            re-raised).
    """
    import shutil

    type_by_extension = {'.txt': 'text', '.csv': 'csv', '.json': 'json'}
    temp_dir = None
    file_id = None
    try:
        base_name = os.path.basename(file.name)
        file_extension = os.path.splitext(file.name)[1].lower()
        # Reject unsupported types before touching the disk so no temporary
        # copy is left behind for them.
        if file_extension not in type_by_extension:
            raise ValueError(f"Unsupported file type: {file_extension}")
        file_type = type_by_extension[file_extension]

        # Write the file content to a temporary file
        temp_dir = tempfile.mkdtemp()
        file_path = os.path.join(temp_dir, base_name)
        shutil.copy(file.name, file_path)
        file_size = os.path.getsize(file_path)

        file_id = self.db_manager.add_file(
            filename=base_name,
            original=base_name,
            path=file_path,
            size=file_size,
            file_type=file_type
        )
        if not file_id:
            raise Exception("Failed to add file to database")

        chunk_count = 0
        if file_type == 'text':
            chunk_count = self._process_text_file(file_path, file_id)
        elif file_type == 'csv':
            chunk_count = self._process_csv_file(file_path, file_id)
        elif file_type == 'json':
            chunk_count = self._process_json_file(file_path, file_id)
        return file_id, chunk_count
    except Exception as e:
        print(f"Error processing file: {e}")
        print(traceback.format_exc())
        # Remove the temp copy only while no database row references it.
        if temp_dir is not None and not file_id:
            shutil.rmtree(temp_dir, ignore_errors=True)
        raise
287
+
288
def _process_text_file(self, file_path, file_id):
    """Store chunks, metadata, analyses and insights for a UTF-8 text file.

    The text is split on blank lines (``'\\n\\n'``); non-blank pieces are
    stored as chunks under their position in that split. Sentiment scores
    and token statistics are stored as analyses, and a sentiment insight is
    added when the compound score is above 0.5 or below -0.5.

    Returns:
        The number of chunks actually stored (blank pieces are skipped and
        not counted).

    Raises:
        Exception: Any error is printed with its traceback and re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        self.db_manager.add_metadata(file_id, 'character_count', str(len(text)))
        self.db_manager.add_metadata(file_id, 'word_count', str(len(text.split())))

        stored_chunks = 0
        for i, chunk in enumerate(text.split('\n\n')):
            if chunk.strip():
                self.db_manager.add_chunk(file_id, i, chunk, len(chunk))
                stored_chunks += 1

        sentiment = self.sia.polarity_scores(text)
        self.db_manager.add_analysis(file_id, 'sentiment_analysis', json.dumps(sentiment))

        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

        token_analysis = {
            'total_tokens': len(tokens),
            'unique_tokens': len(set(tokens)),
            'tokens_without_stopwords': len(filtered_tokens),
            # Slicing already copes with lists shorter than 20 items.
            'sample_tokens': filtered_tokens[:20]
        }

        self.db_manager.add_analysis(file_id, 'tokenization', json.dumps(token_analysis))

        compound = sentiment['compound']
        if compound > 0.5:
            self.db_manager.add_insight(
                file_id, 'sentiment', 'Text has a very positive tone',
                compound, False
            )
        elif compound < -0.5:
            # Confidence is stored as a magnitude, hence abs().
            self.db_manager.add_insight(
                file_id, 'sentiment', 'Text has a very negative tone',
                abs(compound), False
            )

        return stored_chunks

    except Exception as e:
        print(f"Error processing text file: {e}")
        print(traceback.format_exc())
        raise
335
+
336
def _process_csv_file(self, file_path, file_id):
    """Load a CSV file, store it in row batches and record its analyses.

    Saves row/column metadata, writes the table in batches of 100 rows
    (each serialised as JSON records), stores ``describe()`` statistics
    for the numeric columns and, when there are at least two numeric
    columns and 20 rows, fits a random-forest regressor that predicts the
    first numeric column from the remaining ones.

    Args:
        file_path: Path of the CSV file to read.
        file_id: Identifier of the file's database record.

    Returns:
        The number of row batches stored as chunks.

    Raises:
        Exception: Failures outside the modelling step are printed with
            their traceback and re-raised. Modelling failures are only
            printed, so the rest of the analysis is kept.
    """
    rows_per_chunk = 100
    try:
        frame = pd.read_csv(file_path)
        db = self.db_manager

        db.add_metadata(file_id, 'row_count', str(len(frame)))
        db.add_metadata(file_id, 'column_count', str(len(frame.columns)))
        db.add_metadata(file_id, 'columns', ','.join(frame.columns))

        batches = []
        for start in range(0, len(frame), rows_per_chunk):
            batches.append(frame.iloc[start:start + rows_per_chunk])

        for position, batch in enumerate(batches):
            payload = batch.to_json(orient='records')
            db.add_chunk(file_id, position, payload, len(payload))

        numeric_cols = frame.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            summary_json = frame[numeric_cols].describe().to_json()
            db.add_analysis(file_id, 'statistical_analysis', summary_json)

        can_model = len(numeric_cols) >= 2 and len(frame) >= 20
        if can_model:
            try:
                # First numeric column is the target, the rest are features.
                target_col = numeric_cols[0]
                feature_cols = list(numeric_cols[1:])

                X_train, X_test, y_train, y_test = train_test_split(
                    frame[feature_cols], frame[target_col],
                    test_size=0.2, random_state=42
                )

                forest = RandomForestRegressor(n_estimators=50, random_state=42)
                forest.fit(X_train, y_train)

                predictions = forest.predict(X_test)
                mse = mean_squared_error(y_test, predictions)
                r2 = r2_score(y_test, predictions)

                model_results = {
                    'target_column': target_col,
                    'feature_columns': feature_cols,
                    'mean_squared_error': mse,
                    'r2_score': r2,
                    'feature_importance': dict(zip(feature_cols, forest.feature_importances_)),
                }
                db.add_analysis(file_id, 'predictive_model', json.dumps(model_results))

                # Only fits that explain a meaningful share of variance
                # are surfaced as insights.
                if r2 > 0.7:
                    strength = 'Strong'
                elif r2 > 0.3:
                    strength = 'Moderate'
                else:
                    strength = None

                if strength is not None:
                    db.add_insight(
                        file_id, 'prediction',
                        f'{strength} predictive relationship found between {target_col} and other variables',
                        r2, False
                    )
            except Exception as e:
                print(f"Could not create predictive model: {e}")

        return len(batches)

    except Exception as e:
        print(f"Error processing CSV file: {e}")
        print(traceback.format_exc())
        raise
406
+
407
def _process_json_file(self, file_path, file_id):
    """Load a JSON file, store it in chunks and record a structure summary.

    A top-level array is described by its item count and stored in chunks
    of 10 items. A top-level object is described by its keys and stored as
    a single chunk. Any other top-level value is stored as a single chunk
    without metadata. The nested structure summary is saved as the
    ``structure_analysis`` analysis.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.
        file_id: Identifier of the file's database record.

    Returns:
        The number of chunks stored.

    Raises:
        Exception: Any failure is printed with its traceback and re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, list):
            self.db_manager.add_metadata(file_id, 'item_count', str(len(data)))
            self.db_manager.add_metadata(file_id, 'structure', 'array')
        elif isinstance(data, dict):
            self.db_manager.add_metadata(file_id, 'key_count', str(len(data.keys())))
            self.db_manager.add_metadata(file_id, 'structure', 'object')
            self.db_manager.add_metadata(file_id, 'keys', ','.join(data.keys()))

        if isinstance(data, list):
            chunk_size = 10
            chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
        else:
            # Objects and scalar documents are kept whole.
            chunks = [data]

        for i, chunk in enumerate(chunks):
            chunk_text = json.dumps(chunk)
            self.db_manager.add_chunk(file_id, i, chunk_text, len(chunk_text))

        structure_analysis = self._analyze_json_structure(data)
        self.db_manager.add_analysis(file_id, 'structure_analysis', json.dumps(structure_analysis))

        return len(chunks)

    except Exception as e:
        print(f"Error processing JSON file: {e}")
        print(traceback.format_exc())
        raise
442
+
443
def _analyze_json_structure(self, data, max_depth=3, current_depth=0):
    """Summarise the shape of a parsed JSON value.

    Objects become dicts of summarised values (first 10 keys only, with a
    ``"..."`` entry telling how many keys were left out). Lists of up to
    5 items are summarised item by item; longer lists are reduced to the
    summary of their first item, ``"..."`` and an item-count note. Any
    other value is replaced by the name of its Python type.

    Args:
        data: The parsed JSON value to describe.
        max_depth: Nesting level at which the summary is cut off.
        current_depth: Nesting level of *data* (used by the recursion).

    Returns:
        A dict, list or string describing *data*; ``"..."`` once
        ``current_depth`` reaches ``max_depth``.
    """
    if current_depth >= max_depth:
        return "..."

    child_depth = current_depth + 1

    if isinstance(data, dict):
        summary = {
            key: self._analyze_json_structure(value, max_depth, child_depth)
            for key, value in list(data.items())[:10]
        }
        omitted = len(data) - 10
        if omitted > 0:
            summary["..."] = f"{omitted} more keys"
        return summary

    if isinstance(data, list):
        if not data:
            return []
        if len(data) > 5:
            first = self._analyze_json_structure(data[0], max_depth, child_depth)
            return [first, "...", f"{len(data)} items total"]
        return [
            self._analyze_json_structure(element, max_depth, child_depth)
            for element in data
        ]

    return type(data).__name__
466
+
467
class DataDeityApp:
    """Application facade: processes uploaded files and renders their analyses.

    Owns the ``DatabaseManager`` and ``FileProcessor`` instances and keeps a
    mapping from uploaded file name to database file id in
    ``processed_data``. File records returned by ``get_file_by_id`` are read
    positionally: index 1 is shown as the file name, 4 as the size in bytes,
    5 as the file type and 6 as the upload date.
    """

    # Summary rows shown in the statistics table, in display order.
    _STAT_METRICS = ('count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max')

    def __init__(self):
        """Create the database manager and the file processor that uses it."""
        self.db_manager = DatabaseManager()
        self.file_processor = FileProcessor(self.db_manager)
        self.processed_data = {}

    def run(self, file):
        """Process *file* and remember its database id under ``file.name``.

        Returns:
            The chunk count reported by the file processor, or 0 when
            processing failed (the error is printed, not raised).
        """
        try:
            file_id, chunk_count = self.file_processor.process_file(file)
            self.processed_data[file.name] = file_id
            return chunk_count
        except Exception as e:
            print(f"Error in app.run: {e}")
            print(traceback.format_exc())
            return 0

    # ------------------------------------------------------------------
    # HTML fragment helpers shared by the on-screen view and the report
    # ------------------------------------------------------------------

    @staticmethod
    def _escape(value):
        """Return ``str(value)`` with HTML special characters escaped."""
        import html
        return html.escape(str(value))

    @classmethod
    def _format_stat(cls, value):
        """Format one statistics cell.

        Floats are shown with four decimals, missing values (JSON null)
        as ``N/A`` and everything else as escaped text. A conditional
        cannot be placed inside an f-string format spec, so the choice is
        made here instead.
        """
        if value is None:
            return "N/A"
        if isinstance(value, float):
            return f"{value:.4f}"
        return cls._escape(value)

    @staticmethod
    def _render_sentiment(sentiment):
        """Return the score lines and colour bar for a sentiment result."""
        # Gradient stops: red up to the negative share, grey for neutral,
        # green for the remainder.
        neg_end = sentiment['neg'] * 100
        neu_end = (sentiment['neg'] + sentiment['neu']) * 100
        return f"""
            <p><strong>Compound Score:</strong> {sentiment['compound']:.4f}</p>
            <p><strong>Positive:</strong> {sentiment['pos']:.4f}</p>
            <p><strong>Neutral:</strong> {sentiment['neu']:.4f}</p>
            <p><strong>Negative:</strong> {sentiment['neg']:.4f}</p>
            <div class="sentiment-bar" style="background: linear-gradient(to right,
                #ff4d4d 0%,
                #ff4d4d {neg_end}%,
                #f2f2f2 {neg_end}%,
                #f2f2f2 {neu_end}%,
                #4dff4d {neu_end}%,
                #4dff4d 100%);
                height: 20px; border-radius: 5px;">
            </div>
        """

    @classmethod
    def _render_tokenization(cls, token_data):
        """Return the token counts and sample tokens as HTML paragraphs."""
        sample = ', '.join(cls._escape(token) for token in token_data['sample_tokens'])
        return f"""
            <p><strong>Total Tokens:</strong> {token_data['total_tokens']}</p>
            <p><strong>Unique Tokens:</strong> {token_data['unique_tokens']}</p>
            <p><strong>Tokens without Stopwords:</strong> {token_data['tokens_without_stopwords']}</p>
            <p><strong>Sample Tokens:</strong> {sample}</p>
        """

    @classmethod
    def _render_stats_table(cls, stats):
        """Return an HTML table with one column per entry of *stats*.

        *stats* maps a column name to a mapping of metric name to value;
        metrics missing for a column are shown as ``N/A``.
        """
        columns = list(stats)
        header_cells = "".join(f"<th>{cls._escape(col)}</th>" for col in columns)

        rows = []
        for metric in cls._STAT_METRICS:
            cells = []
            for col in columns:
                col_stats = stats[col]
                if metric in col_stats:
                    cells.append(f"<td>{cls._format_stat(col_stats[metric])}</td>")
                else:
                    cells.append("<td>N/A</td>")
            rows.append(f"<tr><td>{metric}</td>{''.join(cells)}</tr>")

        body_rows = "".join(rows)
        return f"<table><tr><th>Statistic</th>{header_cells}</tr>{body_rows}</table>"

    @classmethod
    def _render_model(cls, model_data):
        """Return the model summary with feature-importance bars."""
        ranked = sorted(
            model_data['feature_importance'].items(),
            key=lambda item: item[1],
            reverse=True,
        )
        bars = "".join(
            f'<div style="margin-bottom:5px;"><span>{cls._escape(feat)}</span>: '
            f'<div style="display:inline-block;width:{imp*100}%;'
            f'background-color:#4CAF50;height:10px;"></div> {imp:.4f}</div>'
            for feat, imp in ranked
        )
        features = ', '.join(cls._escape(name) for name in model_data['feature_columns'])
        return f"""
            <p><strong>Target Column:</strong> {cls._escape(model_data['target_column'])}</p>
            <p><strong>Feature Columns:</strong> {features}</p>
            <p><strong>Model Performance:</strong></p>
            <ul>
                <li>Mean Squared Error: {model_data['mean_squared_error']:.4f}</li>
                <li>R² Score: {model_data['r2_score']:.4f}</li>
            </ul>
            <p><strong>Feature Importance:</strong></p>
            <div class="feature-importance">
                {bars}
            </div>
        """

    @classmethod
    def _render_structure(cls, structure):
        """Return the JSON structure summary as a preformatted block."""
        dumped = cls._escape(json.dumps(structure, indent=2))
        return f"""
            <p><strong>Structure Overview:</strong></p>
            <pre>{dumped}</pre>
        """

    @classmethod
    def _render_insight_items(cls, insights):
        """Return one ``<li>`` per (type, text, confidence) insight row."""
        return "".join(
            f"<li><strong>{cls._escape(insight_type.title())}:</strong> "
            f"{cls._escape(insight_text)} (Confidence: {confidence:.2f})</li>"
            for insight_type, insight_text, confidence in insights
        )

    def _render_section(self, file_type, analysis_type, payload):
        """Render one stored analysis.

        Returns:
            ``(title, css_class, body_html)``, or ``None`` when the
            analysis type is not displayed for this file type.
        """
        renderers = {
            ('text', 'sentiment_analysis'):
                ('Sentiment Analysis', 'sentiment-analysis', self._render_sentiment),
            ('text', 'tokenization'):
                ('Text Tokenization', 'tokenization', self._render_tokenization),
            ('csv', 'statistical_analysis'):
                ('Statistical Analysis', 'stats-table', self._render_stats_table),
            ('csv', 'predictive_model'):
                ('Predictive Model', 'predictive-model', self._render_model),
            ('json', 'structure_analysis'):
                ('JSON Structure', 'json-data', self._render_structure),
        }
        entry = renderers.get((file_type, analysis_type))
        if entry is None:
            return None
        title, css_class, render = entry
        return title, css_class, render(json.loads(payload))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_analysis_results(self, file_id):
        """Return the stored analyses of a file as HTML fragments.

        Args:
            file_id: Identifier of the file's database record.

        Returns:
            A dict mapping a section title to its HTML. On failure the
            dict has the single key ``"Error"`` with a message; nothing
            is raised.
        """
        try:
            file_info = self.db_manager.get_file_by_id(file_id)
            if not file_info:
                return {"Error": "File not found"}

            file_type = file_info[5]
            analyses = self.db_manager.get_analysis_by_file_id(file_id)
            insights = self.db_manager.get_insights_by_file_id(file_id)

            results = {}
            results["File Information"] = f"""
                <div class="file-info">
                    <p><strong>Filename:</strong> {self._escape(file_info[1])}</p>
                    <p><strong>Size:</strong> {file_info[4]} bytes</p>
                    <p><strong>Type:</strong> {self._escape(file_info[5])}</p>
                </div>
            """

            for analysis_type, analysis_result in analyses:
                section = self._render_section(file_type, analysis_type, analysis_result)
                if section is None:
                    continue
                title, css_class, body = section
                results[title] = f'<div class="{css_class}">{body}</div>'

            if insights:
                results["Insights"] = (
                    "<div class='insights'><h4>Key Insights</h4><ul>"
                    + self._render_insight_items(insights)
                    + "</ul></div>"
                )

            return results

        except Exception as e:
            print(f"Error getting analysis results: {e}")
            print(traceback.format_exc())
            return {"Error": str(e)}

    def generate_report(self, file_id):
        """Write a standalone HTML report for a file into ``reports/``.

        Args:
            file_id: Identifier of the file's database record.

        Returns:
            The path of the written report, or ``None`` when the file is
            unknown or the report could not be produced (the error is
            printed, not raised).
        """
        # Imported here so the method does not depend on how (or whether)
        # these modules are imported at the top of the file.
        import datetime
        import time

        try:
            file_info = self.db_manager.get_file_by_id(file_id)
            if not file_info:
                return None

            filename = file_info[1]
            file_type = file_info[5]
            analyses = self.db_manager.get_analysis_by_file_id(file_id)
            insights = self.db_manager.get_insights_by_file_id(file_id)

            safe_name = self._escape(filename)
            parts = [f"""<!DOCTYPE html>
<html>
<head>
    <title>Analysis Report: {safe_name}</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1, h2, h3 {{ color: #333; }}
        .container {{ max-width: 1200px; margin: 0 auto; }}
        .section {{ margin-bottom: 30px; padding: 20px; border: 1px solid #ddd; border-radius: 5px; }}
        .file-info {{ background-color: #f9f9f9; padding: 15px; border-radius: 5px; }}
        table {{ border-collapse: collapse; width: 100%; }}
        th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        th {{ background-color: #f2f2f2; }}
        pre {{ background-color: #f8f9fa; padding: 15px; border-radius: 5px; overflow-x: auto; }}
        .sentiment-bar {{ margin-top: 10px; }}
        .insights {{ background-color: #f0f7ff; padding: 15px; border-radius: 5px; }}
    </style>
</head>
<body>
    <div class="container">
        <h1>Analysis Report: {safe_name}</h1>
        <div class="section">
            <h2>File Information</h2>
            <div class="file-info">
                <p><strong>Filename:</strong> {safe_name}</p>
                <p><strong>Size:</strong> {file_info[4]} bytes</p>
                <p><strong>Type:</strong> {self._escape(file_type)}</p>
                <p><strong>Upload Date:</strong> {self._escape(file_info[6])}</p>
            </div>
        </div>
"""]

            for analysis_type, analysis_result in analyses:
                section = self._render_section(file_type, analysis_type, analysis_result)
                if section is None:
                    continue
                title, _css_class, body = section
                parts.append(f"""
        <div class="section">
            <h2>{title}</h2>
            {body}
        </div>
""")

            if insights:
                parts.append(f"""
        <div class="section">
            <h2>Key Insights</h2>
            <div class="insights">
                <ul>
                    {self._render_insight_items(insights)}
                </ul>
            </div>
        </div>
""")

            generated_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            parts.append(f"""
    </div>
    <footer style="text-align: center; margin-top: 30px; color: #777;">
        <p>Generated on {generated_at}</p>
    </footer>
</body>
</html>
""")

            os.makedirs('reports', exist_ok=True)
            # basename() keeps the report inside reports/ even if the
            # stored name carries directory components.
            report_stem = os.path.splitext(os.path.basename(filename))[0]
            report_path = os.path.join(
                'reports', f"report_{report_stem}_{int(time.time())}.html"
            )

            # The document is assembled first and written in one go, so a
            # rendering error cannot leave a half-written report behind.
            with open(report_path, 'w', encoding='utf-8') as f:
                f.write("".join(parts))

            return report_path

        except Exception as e:
            print(f"Error generating report: {e}")
            print(traceback.format_exc())
            return None

    def cleanup(self):
        """Close the database manager; errors are printed, not raised."""
        try:
            self.db_manager.close()
        except Exception as e:
            print(f"Error during cleanup: {e}")
794
+
795
def main():
    """Build the Gradio interface and launch the application.

    Creates the ``DataDeityApp``, wires an "Analyze File" button to the
    analysis view and a "Download Report" button to the HTML report, then
    starts the Gradio server with a public share link.
    """
    import html  # escapes user-controlled text before it is put into HTML

    app = DataDeityApp()

    custom_css = """
    body {
        font-family: 'Arial', sans-serif;
    }
    .analysis-results {
        max-height: 800px;
        overflow-y: auto;
        padding: 15px;
        border-radius: 5px;
        border: 1px solid #eee;
    }
    .sentiment-analysis, .tokenization, .json-data {
        margin: 15px 0;
        padding: 15px;
        border: 1px solid #eee;
        border-radius: 5px;
    }
    pre {
        background-color: #f8f9fa;
        padding: 15px;
        border-radius: 5px;
        overflow-x: auto;
    }
    .stats-table table {
        width: 100%;
        border-collapse: collapse;
    }
    .stats-table th, .stats-table td {
        border: 1px solid #ddd;
        padding: 8px;
        text-align: left;
    }
    .stats-table th {
        background-color: #f2f2f2;
    }
    .error-message {
        color: #d9534f;
        padding: 15px;
        border: 1px solid #d9534f;
        border-radius: 5px;
    }
    .feature-importance {
        margin-top: 10px;
    }
    .insights {
        background-color: #f0f7ff;
        padding: 15px;
        border-radius: 5px;
    }
    """

    def process_and_display(file):
        """Process the uploaded file and return its analysis as HTML."""
        try:
            if file is None:
                return """
                <div class="error-message">
                    <h2>No File Selected</h2>
                    <p>Please upload a file to analyze.</p>
                </div>
                """

            chunk_count = app.run(file)
            file_id = app.processed_data.get(file.name)
            # The file name comes from the upload, so it is escaped
            # before being embedded in markup.
            display_name = html.escape(str(file.name))

            if file_id is not None:
                analysis_results = app.get_analysis_results(file_id)

                output_html = f"""
                <div class="analysis-results">
                    <h2>Analysis Results for {display_name}</h2>
                    <p>Processed {chunk_count} chunks</p>
                """

                for key, value in analysis_results.items():
                    output_html += f"""
                    <div class="result-section">
                        <h3>{key}</h3>
                        {value}
                    </div>
                    """

                output_html += "</div>"
                return output_html
            else:
                return f"""
                <div class="error-message">
                    <h2>Processing Error</h2>
                    <p>Failed to process file: {display_name}</p>
                    <p>Chunks processed: {chunk_count}</p>
                </div>
                """
        except Exception as e:
            print(f"Error in process_and_display: {e}")
            print(traceback.format_exc())
            return f"""
            <div class="error-message">
                <h2>Error</h2>
                <p>An error occurred while processing the file: {html.escape(str(e))}</p>
            </div>
            """

    def generate_and_download_report(file):
        """Return the report path for an already analysed file, else None."""
        try:
            if file is None:
                return None

            file_id = app.processed_data.get(file.name)
            if file_id is not None:
                report_path = app.generate_report(file_id)
                if report_path:
                    return report_path
            return None
        except Exception as e:
            print(f"Error generating report: {e}")
            print(traceback.format_exc())
            return None

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("""
        # Advanced File Processing & Analysis Application

        This application provides comprehensive analysis of text, CSV, and JSON files.

        ### Supported File Types:
        - Text Files (.txt): Sentiment analysis and text tokenization
        - CSV Files (.csv): Statistical analysis and predictive modeling
        - JSON Files (.json): Structure analysis and data exploration

        ### Features:
        - Automated data processing and chunking
        - Advanced analytics and insights
        - Downloadable analysis reports
        """)

        with gr.Row():
            file_input = gr.File(label="Upload a file (.txt, .csv, or .json)")

        with gr.Row():
            analyze_btn = gr.Button("Analyze File", variant="primary")
            download_btn = gr.Button("Download Report", variant="secondary")

        output = gr.HTML(label="Analysis Results")
        report_output = gr.File(label="Download Report")

        analyze_btn.click(
            fn=process_and_display,
            inputs=[file_input],
            outputs=[output]
        )

        download_btn.click(
            fn=generate_and_download_report,
            inputs=[file_input],
            outputs=[report_output]
        )

    demo.launch(share=True)


if __name__ == "__main__":
    main()
huggingface.yml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version: 0.1
2
+ docker:
3
+ image: acecalisto3/Dbgod
nltk_setup.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import nltk
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ # Configure logging
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
10
+ )
11
+ logger = logging.getLogger(__name__)
12
+
13
def setup_nltk():
    """
    Set up NLTK data in a local directory to avoid permission issues.

    Each package is looked up first and downloaded into ``./nltk_data``
    only when it is missing.

    Returns:
        bool: True when every required package is available afterwards;
        False when a required download failed or an error occurred.
    """
    # Package id -> resource path that nltk.data.find() is asked for.
    required_packages = {
        'punkt': 'tokenizers/punkt',
        'vader_lexicon': 'sentiment/vader_lexicon',
        'stopwords': 'corpora/stopwords',
    }
    # Recent NLTK releases load the word tokenizer from 'punkt_tab' rather
    # than 'punkt'. Older package indexes do not offer it, so failing to
    # fetch it is reported as a warning only.
    optional_packages = {
        'punkt_tab': 'tokenizers/punkt_tab',
    }

    try:
        # Create a local directory for NLTK data
        nltk_data_dir = Path('./nltk_data')
        nltk_data_dir.mkdir(exist_ok=True)

        # Add the local directory to NLTK's data path (once, even if this
        # function is called repeatedly)
        data_path = str(nltk_data_dir)
        if data_path not in nltk.data.path:
            nltk.data.path.append(data_path)

        all_required_ok = True
        for package, resource in {**required_packages, **optional_packages}.items():
            try:
                # Try to load the package first
                nltk.data.find(resource)
                logger.info(f"Package '{package}' is already downloaded")
                continue
            except LookupError:
                # If package is not found, download it
                logger.info(f"Downloading package '{package}'...")

            # nltk.download() reports failure through its return value
            # instead of raising, so the result has to be checked.
            if nltk.download(package, download_dir=data_path):
                logger.info(f"Successfully downloaded package '{package}'")
            elif package in optional_packages:
                logger.warning(f"Optional package '{package}' could not be downloaded")
            else:
                logger.error(f"Failed to download package '{package}'")
                all_required_ok = False

        if all_required_ok:
            logger.info("NLTK setup completed successfully")
        return all_required_ok

    except PermissionError as e:
        logger.error(f"Permission error while setting up NLTK: {e}")
        return False
    except Exception as e:
        logger.error(f"Unexpected error during NLTK setup: {e}")
        return False


if __name__ == "__main__":
    setup_nltk()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ matplotlib
2
+ gradio
3
+ pandas
4
+ uvicorn
5
+ numpy
6
+ nltk
7
+ scikit-learn
8
+ seaborn
9
+ psycopg2-binary
10
+ watchdog
11
+ redis
12
+ beautifulsoup4
13
+ pymysql
14
+ pysqlite3-binary
15
+ statsmodels
16
+ pymongo
17
+ python-dotenv