datafreak committed on
Commit
93d997f
·
verified ·
1 Parent(s): 2b4ae34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +520 -329
app.py CHANGED
@@ -1,329 +1,520 @@
1
- import gradio as gr
2
- import os
3
- from pathlib import Path
4
- from pinecone import Pinecone
5
- from typing import List, Tuple
6
- import tempfile
7
- import shutil
8
- from dotenv import load_dotenv
9
- import time
10
-
11
- # Load environment variables
12
- load_dotenv()
13
-
14
- # Validate required environment variables
15
- required_env_vars = ["PINECONE_API_KEY"]
16
- missing_vars = [var for var in required_env_vars if not os.getenv(var)]
17
-
18
- if missing_vars:
19
- raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")
20
-
21
- # Initialize Pinecone
22
- pinecone_api_key = os.getenv("PINECONE_API_KEY")
23
- pc = Pinecone(api_key=pinecone_api_key)
24
-
25
- # Create uploads directory
26
- UPLOAD_FOLDER = "uploads"
27
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
28
-
29
- def process_files_with_progress(files, *metadata_inputs, progress=gr.Progress()):
30
- """Process multiple files with individual metadata and show progress"""
31
- if not files:
32
- return "❌ Error: No files selected", ""
33
-
34
- if len(files) > 10:
35
- return "❌ Error: Maximum 10 files allowed at a time", ""
36
-
37
- try:
38
- results = []
39
- errors = []
40
- total_files = len(files)
41
-
42
- # Initialize Pinecone Assistant
43
- progress(0, desc="πŸ”§ Initializing Pinecone Assistant...")
44
- time.sleep(0.5) # Small delay to show the progress
45
- assistant_name = os.getenv("PINECONE_ASSISTANT_NAME", "gstminutes")
46
- assistant = pc.assistant.Assistant(assistant_name=assistant_name)
47
-
48
- # Process each file with its individual metadata
49
- for i, file_path in enumerate(files):
50
- try:
51
- filename = os.path.basename(file_path)
52
- progress((i / total_files), desc=f"πŸ“„ Processing {filename}... ({i+1}/{total_files})")
53
-
54
- # Get metadata for this specific file (3 fields per file: sections, keywords, description)
55
- sections_idx = i * 3
56
- keywords_idx = i * 3 + 1
57
- description_idx = i * 3 + 2
58
-
59
- if sections_idx < len(metadata_inputs):
60
- sections = metadata_inputs[sections_idx] or ""
61
- keywords = metadata_inputs[keywords_idx] or ""
62
- description = metadata_inputs[description_idx] or ""
63
- else:
64
- sections = keywords = description = ""
65
-
66
- # Skip if no metadata provided for this file
67
- if not sections.strip() and not keywords.strip() and not description.strip():
68
- errors.append({
69
- "filename": filename,
70
- "error": "❌ Error: No metadata provided"
71
- })
72
- continue
73
-
74
- # Prepare metadata for this file
75
- progress((i / total_files), desc=f"🏷️ Preparing metadata for {filename}...")
76
- metadata = {
77
- "sections": [s.strip() for s in sections.split(",") if s.strip()],
78
- "keywords": [k.strip() for k in keywords.split(",") if k.strip()],
79
- "description": description.strip()
80
- }
81
-
82
- # Copy to uploads directory
83
- progress((i / total_files), desc=f"πŸ“ Copying {filename} to uploads...")
84
- destination_path = os.path.join(UPLOAD_FOLDER, filename)
85
- shutil.copy2(file_path, destination_path)
86
-
87
- # Upload to Pinecone Assistant
88
- progress((i / total_files), desc=f"☁️ Uploading {filename} to Pinecone...")
89
- response = assistant.upload_file(
90
- file_path=destination_path,
91
- metadata=metadata,
92
- timeout=None
93
- )
94
-
95
- results.append({
96
- "filename": filename,
97
- "status": "βœ… Success",
98
- "metadata": metadata,
99
- "response": str(response)
100
- })
101
-
102
- except Exception as file_error:
103
- errors.append({
104
- "filename": os.path.basename(file_path),
105
- "error": f"❌ Error: {str(file_error)}"
106
- })
107
-
108
- # Final progress update
109
- progress(1.0, desc="βœ… Processing complete!")
110
- time.sleep(0.5)
111
-
112
- # Format results for display
113
- success_count = len(results)
114
- error_count = len(errors)
115
-
116
- status_message = f"πŸ“Š **Processing Complete**\n\n"
117
- status_message += f"βœ… **Successful uploads:** {success_count}\n"
118
- status_message += f"❌ **Failed uploads:** {error_count}\n"
119
- status_message += f"πŸ“ **Total files processed:** {len(files)}\n\n"
120
-
121
- # Detailed results
122
- detailed_results = "## πŸ“‹ **Detailed Results**\n\n"
123
-
124
- if results:
125
- detailed_results += "### βœ… **Successful Uploads:**\n"
126
- for result in results:
127
- detailed_results += f"- **{result['filename']}**\n"
128
- detailed_results += f" - Sections: {', '.join(result['metadata']['sections'])}\n"
129
- detailed_results += f" - Keywords: {', '.join(result['metadata']['keywords'])}\n"
130
- detailed_results += f" - Description: {result['metadata']['description']}\n\n"
131
-
132
- if errors:
133
- detailed_results += "### ❌ **Failed Uploads:**\n"
134
- for error in errors:
135
- detailed_results += f"- **{error['filename']}** - {error['error']}\n"
136
-
137
- return status_message, detailed_results, "βœ… **Processing completed successfully!**"
138
-
139
- except Exception as e:
140
- error_msg = f"❌ **Critical Error:** {str(e)}"
141
- return error_msg, "", "❌ **Processing failed with error**"
142
-
143
- def update_metadata_fields(files):
144
- """Update metadata fields based on uploaded files"""
145
- if not files:
146
- return [gr.update(visible=False)] * 30 # Hide all fields
147
-
148
- if len(files) > 10:
149
- # Show error and hide all fields
150
- return [gr.update(visible=False)] * 30
151
-
152
- updates = []
153
- for i in range(len(files)):
154
- if i < len(files):
155
- filename = os.path.basename(files[i])
156
- # Show 3 fields per file (sections, keywords, description)
157
- updates.extend([
158
- gr.update(visible=True, label=f"πŸ“‘ Sections for {filename}", placeholder="e.g., Introduction, Financial Data, Compliance"),
159
- gr.update(visible=True, label=f"πŸ” Keywords for {filename}", placeholder="e.g., GST, tax, compliance, revenue"),
160
- gr.update(visible=True, label=f"πŸ“ Description for {filename}", placeholder="Brief description of this document")
161
- ])
162
-
163
- # Hide remaining fields
164
- while len(updates) < 30:
165
- updates.append(gr.update(visible=False))
166
-
167
- return updates[:30]
168
-
169
- def clear_form():
170
- """Clear all form fields"""
171
- return [None] + [""] * 30 + ["", "", "🟒 **Ready to process documents**"]
172
-
173
- def start_processing():
174
- """Show processing started status"""
175
- return "πŸ”„ **Processing documents... Please wait**"
176
-
177
- def finish_processing():
178
- """Show processing finished status"""
179
- return "βœ… **Processing completed successfully!**"
180
-
181
- # Create Gradio interface
182
- with gr.Blocks(
183
- title="πŸ“„ Tax Document Ingestion System",
184
- theme=gr.themes.Soft(),
185
- css="""
186
- .gradio-container {
187
- max-width: 1200px !important;
188
- margin: auto;
189
- }
190
- .upload-container {
191
- border: 2px dashed #4CAF50;
192
- border-radius: 10px;
193
- padding: 20px;
194
- text-align: center;
195
- background-color: #f8f9fa;
196
- }
197
- """
198
- ) as app:
199
-
200
- gr.Markdown(
201
- """
202
- # πŸ“„ Tax Document Ingestion System
203
-
204
- Upload multiple documents with metadata to the Pinecone Assistant for GST Minutes processing.
205
-
206
- ## πŸš€ Features:
207
- - βœ… **Multiple file upload** - Select and upload multiple documents at once
208
- - 🏷️ **Metadata tagging** - Add sections, keywords, and descriptions
209
- - πŸ”„ **Batch processing** - All files processed with the same metadata
210
- - πŸ“Š **Detailed reporting** - See success/failure status for each file
211
-
212
- ---
213
- """
214
- )
215
-
216
- with gr.Row():
217
- with gr.Column(scale=1):
218
- gr.Markdown("### πŸ“ **File Upload**")
219
- files_input = gr.File(
220
- label="Select Documents (Max 10 files)",
221
- file_count="multiple",
222
- file_types=[".pdf", ".doc", ".docx", ".txt"],
223
- elem_classes=["upload-container"]
224
- )
225
-
226
- with gr.Column(scale=1):
227
- gr.Markdown("### 🏷️ **Document Metadata (Individual for Each File)**")
228
- gr.Markdown("*Upload files first, then metadata fields will appear for each document*")
229
-
230
- # Dynamic metadata fields container
231
- with gr.Column() as metadata_container:
232
- # Create 30 text fields (enough for 10 files with 3 fields each)
233
- metadata_fields = []
234
- for i in range(30):
235
- field = gr.Textbox(
236
- label=f"Field {i}",
237
- placeholder="",
238
- visible=False,
239
- lines=2
240
- )
241
- metadata_fields.append(field)
242
-
243
- with gr.Row():
244
- with gr.Column(scale=1):
245
- upload_btn = gr.Button(
246
- "πŸš€ Upload Documents to Pinecone Assistant",
247
- variant="primary",
248
- size="lg"
249
- )
250
-
251
- with gr.Column(scale=1):
252
- clear_btn = gr.Button(
253
- "πŸ—‘οΈ Clear Form",
254
- variant="secondary",
255
- size="lg"
256
- )
257
-
258
- # Processing status indicator
259
- with gr.Row():
260
- processing_status = gr.Markdown(
261
- value="🟒 **Ready to process documents**",
262
- visible=True
263
- )
264
-
265
- gr.Markdown("---")
266
-
267
- # Results section
268
- with gr.Row():
269
- with gr.Column():
270
- status_output = gr.Markdown(
271
- label="πŸ“Š Upload Status",
272
- value="*Ready to upload documents...*"
273
- )
274
-
275
- with gr.Row():
276
- with gr.Column():
277
- results_output = gr.Markdown(
278
- label="πŸ“‹ Detailed Results",
279
- value=""
280
- )
281
-
282
- # Event handlers
283
-
284
- # Update metadata fields when files are uploaded
285
- files_input.change(
286
- fn=update_metadata_fields,
287
- inputs=[files_input],
288
- outputs=metadata_fields
289
- )
290
-
291
- # Show processing status when upload starts
292
- upload_btn.click(
293
- fn=start_processing,
294
- outputs=[processing_status]
295
- ).then(
296
- fn=process_files_with_progress,
297
- inputs=[files_input] + metadata_fields,
298
- outputs=[status_output, results_output, processing_status]
299
- )
300
-
301
- clear_btn.click(
302
- fn=clear_form,
303
- outputs=[files_input] + metadata_fields + [status_output, results_output, processing_status]
304
- )
305
-
306
- # Footer
307
- gr.Markdown(
308
- """
309
- ---
310
-
311
- ### πŸ’‘ **Usage Tips:**
312
- - Select up to 10 PDF, DOC, DOCX, or TXT files at once
313
- - Upload files first, then fill individual metadata for each document
314
- - Each file gets its own sections, keywords, and description
315
- - Check the results section for upload status
316
-
317
- ### πŸ“ž **Support:**
318
- For issues or questions, contact the development team.
319
- """
320
- )
321
-
322
- if __name__ == "__main__":
323
- app.launch(
324
- server_name="0.0.0.0",
325
- server_port=7860,
326
- share=False,
327
- debug=True,
328
- show_error=True
329
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from pathlib import Path
4
+ from pinecone import Pinecone
5
+ from typing import List, Tuple
6
+ import tempfile
7
+ import shutil
8
+ from dotenv import load_dotenv
9
+ import time
10
+ from datetime import datetime
11
+ import json
12
+
13
# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast at import time when a mandatory setting is absent or empty.
required_env_vars = ["PINECONE_API_KEY"]
missing_vars = list(filter(lambda name: not os.getenv(name), required_env_vars))

if missing_vars:
    raise ValueError("Missing required environment variables: " + ", ".join(missing_vars))

# Shared Pinecone client used by every handler below.
# The key is guaranteed to be set and non-empty by the check above.
pinecone_api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=pinecone_api_key)

# Local folder that receives a copy of each document before it is uploaded.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
30
+
31
def _format_file_size(size_in_bytes):
    """Render a byte count as a short human-readable string (bytes, KB or MB)."""
    if not isinstance(size_in_bytes, (int, float)):
        # A missing or non-numeric size must not abort the whole file entry.
        return "Unknown"
    if size_in_bytes > 1024 * 1024:
        return f"{size_in_bytes / (1024 * 1024):.2f} MB"
    if size_in_bytes > 1024:
        return f"{size_in_bytes / 1024:.2f} KB"
    return f"{size_in_bytes} bytes"


def _format_timestamp(raw_value):
    """Format an ISO-8601 timestamp for display, falling back to its raw text."""
    if raw_value is None or raw_value == 'Unknown':
        return 'Unknown'
    if isinstance(raw_value, datetime):
        parsed = raw_value
    else:
        try:
            # fromisoformat() on older Pythons rejects a trailing "Z".
            parsed = datetime.fromisoformat(str(raw_value).replace('Z', '+00:00'))
        except ValueError:
            return str(raw_value)
    # NOTE(review): the value is labelled UTC without conversion, as before --
    # confirm the service always reports UTC.
    return parsed.strftime('%Y-%m-%d %H:%M:%S UTC')


def _format_metadata_lines(metadata):
    """Return the markdown lines describing one file's metadata."""
    lines = ["- **🏷️ Metadata:**\n"]
    if isinstance(metadata, dict):
        for key, value in metadata.items():
            shown = ', '.join(map(str, value)) if isinstance(value, list) else value
            lines.append(f" - **{str(key).title()}:** {shown}\n")
    else:
        lines.append(f" - {metadata}\n")
    return lines


def list_uploaded_files(progress=gr.Progress()):
    """List the files stored in the Pinecone Assistant as markdown.

    Args:
        progress: Gradio progress tracker used to report the current step.

    Returns:
        A ``(summary, details)`` tuple of markdown strings. When no files
        exist, or when the listing fails, ``details`` is empty and ``summary``
        carries the message. Exceptions are reported in the returned text
        rather than raised.
    """
    try:
        progress(0.1, desc="πŸ” Connecting to Pinecone Assistant...")
        assistant_name = os.getenv("PINECONE_ASSISTANT_NAME", "gstminutes")
        assistant = pc.assistant.Assistant(assistant_name=assistant_name)

        progress(0.3, desc="πŸ“‹ Fetching file list...")
        time.sleep(0.5)

        files_response = assistant.list_files()

        progress(0.7, desc="πŸ“Š Processing file information...")
        time.sleep(0.3)

        # NOTE(review): the SDK is understood to return a plain list of file
        # models. The previous hasattr(..., 'files') guard treated such a list
        # as "no files". Both a list and a wrapper exposing `.files` are
        # accepted here.
        files_list = list(getattr(files_response, 'files', files_response) or [])
        if not files_list:
            progress(1.0, desc="βœ… Complete - No files found")
            return "πŸ“‹ **No files found in Pinecone Assistant**", ""

        total_files = len(files_list)

        # Most recent first. Keys are coerced to str so a missing timestamp
        # cannot raise when compared with real ones.
        sorted_files = sorted(
            files_list,
            key=lambda item: str(getattr(item, 'created_on', '') or ''),
            reverse=True,
        )

        progress(0.9, desc="πŸ“ Formatting results...")

        summary = f"πŸ“Š **Files Summary**\n\n"
        summary += f"πŸ“ **Total files:** {total_files}\n"
        summary += f"πŸ• **Last updated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"

        parts = ["## πŸ“‹ **File Details**\n\n"]

        for i, file_obj in enumerate(sorted_files, 1):
            # Build each entry separately so a failure does not leave a
            # half-written entry in the output.
            entry = []
            try:
                file_name = getattr(file_obj, 'name', 'Unknown')
                file_id = getattr(file_obj, 'id', 'Unknown')
                created_on = getattr(file_obj, 'created_on', 'Unknown')
                updated_on = getattr(file_obj, 'updated_on', created_on)

                size_str = _format_file_size(getattr(file_obj, 'size', 0))
                created_formatted = _format_timestamp(created_on)
                if updated_on is None or updated_on == 'Unknown':
                    updated_formatted = created_formatted
                else:
                    updated_formatted = _format_timestamp(updated_on)

                entry.append(f"### {i}. πŸ“„ **{file_name}**\n")
                entry.append(f"- **πŸ†” File ID:** `{file_id}`\n")
                entry.append(f"- **πŸ“ Size:** {size_str}\n")
                entry.append(f"- **πŸ“… Uploaded:** {created_formatted}\n")
                entry.append(f"- **πŸ”„ Last Updated:** {updated_formatted}\n")

                try:
                    if hasattr(file_obj, 'metadata'):
                        # Use metadata already present on the listing entry
                        # and skip one describe_file round trip per file.
                        metadata = file_obj.metadata
                    else:
                        file_details = assistant.describe_file(file_id=file_id)
                        metadata = getattr(file_details, 'metadata', None)
                    if metadata:
                        entry.extend(_format_metadata_lines(metadata))
                except Exception:
                    # Best effort: metadata is optional for the listing.
                    entry.append(f"- **🏷️ Metadata:** Could not retrieve metadata\n")

                entry.append("\n---\n\n")

            except Exception as file_error:
                entry = [
                    f"### {i}. ❌ **Error processing file**\n",
                    f"- **Error:** {str(file_error)}\n\n---\n\n",
                ]

            parts.extend(entry)

        detailed_info = "".join(parts)

        progress(1.0, desc="βœ… File list retrieved successfully!")
        time.sleep(0.3)

        return summary, detailed_info

    except Exception as e:
        # Top-level UI boundary: report the failure instead of raising.
        error_msg = f"❌ **Error retrieving file list:** {str(e)}"
        return error_msg, ""
138
+
139
def refresh_file_list():
    """Return the interim status text shown while the file list is reloading."""
    refreshing_notice = "πŸ”„ **Refreshing file list... Please wait**"
    return refreshing_notice
142
+
143
def process_files_with_progress(files, *metadata_inputs, progress=gr.Progress()):
    """Upload each selected file to the Pinecone Assistant with its own metadata.

    Args:
        files: Paths of the selected files (at most 10).
        *metadata_inputs: Flat sequence of text values, three per file in the
            order sections, keywords, description. Sections and keywords are
            comma-separated.
        progress: Gradio progress tracker used to report the current step.

    Returns:
        A ``(status_message, detailed_results, processing_status)`` tuple of
        markdown strings. All three values are returned on every path because
        the handler is wired to three output components. Exceptions are
        reported in the returned text rather than raised.
    """
    failure_status = "❌ **Processing failed with error**"

    if not files:
        return "❌ Error: No files selected", "", failure_status

    if len(files) > 10:
        return "❌ Error: Maximum 10 files allowed at a time", "", failure_status

    try:
        results = []
        errors = []
        total_files = len(files)

        progress(0, desc="πŸ”§ Initializing Pinecone Assistant...")
        time.sleep(0.5)  # Small delay to show the progress
        assistant_name = os.getenv("PINECONE_ASSISTANT_NAME", "gstminutes")
        assistant = pc.assistant.Assistant(assistant_name=assistant_name)

        for i, file_path in enumerate(files):
            try:
                filename = os.path.basename(file_path)
                progress((i / total_files), desc=f"πŸ“„ Processing {filename}... ({i+1}/{total_files})")

                # Three consecutive fields belong to each file. Slicing pads
                # missing trailing values instead of raising IndexError.
                raw_fields = list(metadata_inputs[i * 3:i * 3 + 3])
                raw_fields += [""] * (3 - len(raw_fields))
                sections, keywords, description = (value or "" for value in raw_fields)

                # A file with no metadata at all is reported as failed, not uploaded.
                if not (sections.strip() or keywords.strip() or description.strip()):
                    errors.append({
                        "filename": filename,
                        "error": "❌ Error: No metadata provided"
                    })
                    continue

                progress((i / total_files), desc=f"🏷️ Preparing metadata for {filename}...")
                metadata = {
                    "sections": [s.strip() for s in sections.split(",") if s.strip()],
                    "keywords": [k.strip() for k in keywords.split(",") if k.strip()],
                    "description": description.strip()
                }

                progress((i / total_files), desc=f"πŸ“ Copying {filename} to uploads...")
                destination_path = os.path.join(UPLOAD_FOLDER, filename)
                shutil.copy2(file_path, destination_path)

                progress((i / total_files), desc=f"☁️ Uploading {filename} to Pinecone...")
                response = assistant.upload_file(
                    file_path=destination_path,
                    metadata=metadata,
                    timeout=None
                )

                results.append({
                    "filename": filename,
                    "status": "βœ… Success",
                    "metadata": metadata,
                    "response": str(response)
                })

            except Exception as file_error:
                # One failing file must not stop the rest of the batch.
                errors.append({
                    "filename": os.path.basename(file_path),
                    "error": f"❌ Error: {str(file_error)}"
                })

        progress(1.0, desc="βœ… Processing complete!")
        time.sleep(0.5)

        success_count = len(results)
        error_count = len(errors)

        status_message = f"πŸ“Š **Processing Complete**\n\n"
        status_message += f"βœ… **Successful uploads:** {success_count}\n"
        status_message += f"❌ **Failed uploads:** {error_count}\n"
        status_message += f"πŸ“ **Total files processed:** {len(files)}\n\n"

        detail_parts = ["## πŸ“‹ **Detailed Results**\n\n"]

        if results:
            detail_parts.append("### βœ… **Successful Uploads:**\n")
            for result in results:
                detail_parts.append(f"- **{result['filename']}**\n")
                detail_parts.append(f" - Sections: {', '.join(result['metadata']['sections'])}\n")
                detail_parts.append(f" - Keywords: {', '.join(result['metadata']['keywords'])}\n")
                detail_parts.append(f" - Description: {result['metadata']['description']}\n\n")

        if errors:
            detail_parts.append("### ❌ **Failed Uploads:**\n")
            for error in errors:
                detail_parts.append(f"- **{error['filename']}** - {error['error']}\n")

        detailed_results = "".join(detail_parts)

        # Report success only when every file was actually uploaded.
        if error_count:
            final_status = f"❌ **Processing finished with {error_count} failed upload(s)**"
        else:
            final_status = "βœ… **Processing completed successfully!**"

        return status_message, detailed_results, final_status

    except Exception as e:
        # Top-level UI boundary: report the failure instead of raising.
        error_msg = f"❌ **Critical Error:** {str(e)}"
        return error_msg, "", failure_status
256
+
257
def update_metadata_fields(files):
    """Build visibility/label updates for the 30 metadata textboxes.

    Each selected file gets three visible fields (sections, keywords,
    description) labelled with its base name, and the remaining fields are
    hidden. With no files, or more than 10, every field is hidden.

    Returns:
        A list of exactly 30 ``gr.update(...)`` values.
    """
    # Nothing selected, or too many files: hide everything.
    if not files or len(files) > 10:
        return [gr.update(visible=False)] * 30

    updates = []
    for path in files:
        filename = os.path.basename(path)
        sections_field = gr.update(
            visible=True,
            label=f"πŸ“‘ Sections for {filename}",
            placeholder="e.g., Introduction, Financial Data, Compliance",
        )
        keywords_field = gr.update(
            visible=True,
            label=f"πŸ” Keywords for {filename}",
            placeholder="e.g., GST, tax, compliance, revenue",
        )
        description_field = gr.update(
            visible=True,
            label=f"πŸ“ Description for {filename}",
            placeholder="Brief description of this document",
        )
        updates += [sections_field, keywords_field, description_field]

    # Pad with hidden fields up to the fixed number of textboxes.
    updates += [gr.update(visible=False) for _ in range(30 - len(updates))]

    return updates[:30]
282
+
283
def clear_form():
    """Return reset values for the file picker, metadata fields and outputs.

    The list holds 34 entries: ``None`` for the file input, 30 empty strings
    for the metadata textboxes, two empty strings for the result panes, and
    the ready-state status text.
    """
    cleared_file_input = [None]
    cleared_metadata = ["" for _ in range(30)]
    cleared_outputs = ["", "", "🟒 **Ready to process documents**"]
    return cleared_file_input + cleared_metadata + cleared_outputs
286
+
287
def start_processing():
    """Return the status text shown as soon as an upload run begins."""
    in_progress_notice = "πŸ”„ **Processing documents... Please wait**"
    return in_progress_notice
290
+
291
def finish_processing():
    """Return the status text for a completed upload run."""
    completed_notice = "βœ… **Processing completed successfully!**"
    return completed_notice
294
+
295
+ # Create Gradio interface
296
with gr.Blocks(
    title="πŸ“„ Tax Document Ingestion System",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px !important;
        margin: auto;
    }
    .upload-container {
        border: 2px dashed #4CAF50;
        border-radius: 10px;
        padding: 20px;
        text-align: center;
        background-color: #f8f9fa;
    }
    .tab-nav {
        margin-bottom: 20px;
    }
    """
) as app:

    gr.Markdown(
        """
        # πŸ“„ Tax Document Ingestion System

        Upload and manage documents in the Pinecone Assistant for GST Minutes processing.

        ## πŸš€ Features:
        - βœ… **Multiple file upload** - Select and upload multiple documents at once
        - 🏷️ **Metadata tagging** - Add sections, keywords, and descriptions
        - πŸ”„ **Batch processing** - All files processed with individual metadata
        - πŸ“Š **File management** - View uploaded files with timestamps and metadata
        - πŸ“‹ **Detailed reporting** - See success/failure status for each operation

        ---
        """
    )

    # Two tabs: one to upload documents, one to inspect what is already stored.
    with gr.Tabs() as tabs:

        # Tab 1: Upload Documents
        with gr.TabItem("πŸ“€ Upload Documents", id="upload_tab"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### πŸ“ **File Upload**")
                    files_input = gr.File(
                        label="Select Documents (Max 10 files)",
                        file_count="multiple",
                        file_types=[".pdf", ".doc", ".docx", ".txt"],
                        elem_classes=["upload-container"]
                    )

                with gr.Column(scale=1):
                    gr.Markdown("### 🏷️ **Document Metadata (Individual for Each File)**")
                    gr.Markdown("*Upload files first, then metadata fields will appear for each document*")

                    # The textboxes are created up front and hidden;
                    # update_metadata_fields reveals three per selected file.
                    with gr.Column() as metadata_container:
                        # 30 fields = 10 files x 3 fields each
                        metadata_fields = []
                        for i in range(30):
                            field = gr.Textbox(
                                label=f"Field {i}",
                                placeholder="",
                                visible=False,
                                lines=2
                            )
                            metadata_fields.append(field)

            with gr.Row():
                with gr.Column(scale=1):
                    upload_btn = gr.Button(
                        "πŸš€ Upload Documents to Pinecone Assistant",
                        variant="primary",
                        size="lg"
                    )

                with gr.Column(scale=1):
                    clear_btn = gr.Button(
                        "πŸ—‘οΈ Clear Form",
                        variant="secondary",
                        size="lg"
                    )

            # Processing status indicator
            with gr.Row():
                processing_status = gr.Markdown(
                    value="🟒 **Ready to process documents**",
                    visible=True
                )

            gr.Markdown("---")

            # Results section
            with gr.Row():
                with gr.Column():
                    status_output = gr.Markdown(
                        label="πŸ“Š Upload Status",
                        value="*Ready to upload documents...*"
                    )

            with gr.Row():
                with gr.Column():
                    results_output = gr.Markdown(
                        label="πŸ“‹ Detailed Results",
                        value=""
                    )

        # Tab 2: View Uploaded Files
        with gr.TabItem("πŸ“‹ View Uploaded Files", id="view_tab"):
            gr.Markdown("### πŸ“‹ **Uploaded Files Management**")
            gr.Markdown("View all files currently uploaded to the Pinecone Assistant with their metadata and timestamps.")

            with gr.Row():
                refresh_btn = gr.Button(
                    "πŸ”„ Refresh File List",
                    variant="primary",
                    size="lg"
                )
                auto_refresh_btn = gr.Button(
                    "πŸ“‹ Load Files on Startup",
                    variant="secondary",
                    size="lg"
                )

            # File list status
            with gr.Row():
                file_list_status = gr.Markdown(
                    value="🟑 **Click 'Refresh File List' to load uploaded files**",
                    visible=True
                )

            gr.Markdown("---")

            # File list results
            with gr.Row():
                with gr.Column(scale=1):
                    file_summary = gr.Markdown(
                        label="πŸ“Š Files Summary",
                        value="*Click refresh to load file summary...*"
                    )

            with gr.Row():
                with gr.Column():
                    file_details = gr.Markdown(
                        label="πŸ“‹ File Details",
                        value="*Click refresh to load file details...*"
                    )

    # Event handlers for Upload tab

    # Reveal and label the metadata fields whenever the file selection changes.
    files_input.change(
        fn=update_metadata_fields,
        inputs=[files_input],
        outputs=metadata_fields
    )

    # Show the "processing" status first, then run the upload itself.
    upload_btn.click(
        fn=start_processing,
        outputs=[processing_status]
    ).then(
        fn=process_files_with_progress,
        inputs=[files_input] + metadata_fields,
        outputs=[status_output, results_output, processing_status]
    )

    clear_btn.click(
        fn=clear_form,
        outputs=[files_input] + metadata_fields + [status_output, results_output, processing_status]
    )

    # Event handlers for View Files tab

    # Both buttons run the same chain; despite its label, the second button
    # only loads files when clicked. The last step replaces the interim
    # "Refreshing..." text, which previously stayed on screen forever.
    for list_trigger in (refresh_btn, auto_refresh_btn):
        list_trigger.click(
            fn=refresh_file_list,
            outputs=[file_list_status]
        ).then(
            fn=list_uploaded_files,
            outputs=[file_summary, file_details]
        ).then(
            fn=lambda: "βœ… **File list refresh finished**",
            outputs=[file_list_status]
        )

    # Footer
    gr.Markdown(
        """
        ---

        ### πŸ’‘ **Usage Tips:**

        **Upload Documents:**
        - Select up to 10 PDF, DOC, DOCX, or TXT files at once
        - Upload files first, then fill individual metadata for each document
        - Each file gets its own sections, keywords, and description
        - Check the results section for upload status

        **View Uploaded Files:**
        - Click 'Refresh File List' to see all uploaded files
        - View file details including upload timestamps and metadata
        - Files are sorted by most recent first

        ### πŸ“ž **Support:**
        For issues or questions, contact the development team.
        """
    )
512
+
513
if __name__ == "__main__":
    # Listen on every interface at the fixed port, create no public share
    # link, and enable debug mode and error display.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": True,
        "show_error": True,
    }
    app.launch(**launch_options)