Alessio Vertemati committed on
Commit
07efe32
·
0 Parent(s):

That's a start

Browse files
Files changed (6) hide show
  1. .gitattributes +35 -0
  2. .gitignore +13 -0
  3. .python-version +1 -0
  4. app.py +432 -0
  5. pyproject.toml +16 -0
  6. uv.lock +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+
13
+ .env
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
app.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ from typing import List, Tuple, Optional, Dict
5
+ from dataclasses import dataclass, asdict
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
# Upper bound for a single attachment: 10 MB, expressed in bytes
MAX_ATTACHMENT_SIZE = 10 * 1024 ** 2
13
+
14
+
15
@dataclass
class Attachment:
    """A file queued for embedding in a PDF, together with its metadata.

    The four fields mirror the keys of the per-session attachment
    dictionaries ('filename', 'filepath', 'description', 'size').
    """
    filename: str  # base name of the uploaded file
    filepath: str  # location of the uploaded file on disk
    description: str  # user-supplied text describing the attachment
    size: int  # file size in bytes
22
+
23
+
24
def get_attachments_from_state(attachments_state: List[Dict]) -> List[Attachment]:
    """
    Convert the session's attachment dictionaries into Attachment objects.

    Args:
        attachments_state: The session state containing attachment
            dictionaries with 'filename', 'filepath', 'description' and
            'size' keys. May be None or empty.

    Returns:
        List of Attachment objects, in the same order as the state.
        Entries whose 'filepath' is missing, empty, or does not point to a
        regular file on disk are skipped.

    Raises:
        KeyError: If an entry with a valid 'filepath' lacks one of the
            other three keys.

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent state.
    """
    if not attachments_state:
        return []

    attachments = []
    for entry in attachments_state:
        filepath = entry.get('filepath')
        # isfile rather than exists: a directory can never be embedded as an
        # attachment, so it must not pass the check.
        if not filepath or not os.path.isfile(filepath):
            continue
        attachments.append(Attachment(
            filename=entry['filename'],
            filepath=filepath,
            description=entry['description'],
            size=entry['size'],
        ))

    return attachments
52
+
53
+
54
def add_attachment_to_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Embed one attachment into a PDF file (placeholder, nothing is written yet).

    Args:
        pdf_path: Path to the PDF file
        attachment: The Attachment object to add

    Returns:
        True if successful, False otherwise. The placeholder always
        returns True.

    TODO: Implement the actual PDF attachment logic here
    """
    # ------------------------------------------------------------------
    # PLACEHOLDER: the real embedding logic belongs here. It would:
    #   1. Load the PDF using a library like pypdf, PyMuPDF, or pikepdf
    #   2. Embed the file from attachment.filepath as an attachment
    #   3. Set the attachment description as metadata
    #   4. Save the PDF back to pdf_path
    # ------------------------------------------------------------------
    notice = f"[PLACEHOLDER] Would add attachment to PDF: {attachment.filename}"
    print(notice)
    return True
78
+
79
+
80
def remove_attachment_from_pdf(pdf_path: str, attachment: Attachment) -> bool:
    """
    Strip one attachment from a PDF file (placeholder, nothing is modified yet).

    Args:
        pdf_path: Path to the PDF file
        attachment: The Attachment object to remove

    Returns:
        True if successful, False otherwise. The placeholder always
        returns True.

    TODO: Implement the actual PDF attachment removal logic here
    """
    # ------------------------------------------------------------------
    # PLACEHOLDER: the real removal logic belongs here. It would:
    #   1. Load the PDF using a library like pypdf, PyMuPDF, or pikepdf
    #   2. Find and remove the embedded file matching attachment.filename
    #   3. Save the PDF back to pdf_path
    # ------------------------------------------------------------------
    notice = f"[PLACEHOLDER] Would remove attachment from PDF: {attachment.filename}"
    print(notice)
    return True
103
+
104
+
105
def list_pdf_attachments(pdf_path: str) -> List[Attachment]:
    """
    Enumerate the attachments embedded in a PDF file (placeholder).

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Attachment objects found in the PDF. The placeholder
        always returns an empty list.

    TODO: Implement the actual PDF attachment listing logic here
    """
    # ------------------------------------------------------------------
    # PLACEHOLDER: the real listing logic belongs here. It would:
    #   1. Load the PDF using a library like pypdf, PyMuPDF, or pikepdf
    #   2. Iterate through embedded files
    #   3. Extract filename, description, and size for each
    #   4. Return as list of Attachment objects
    # ------------------------------------------------------------------
    found: list = []
    print(f"[PLACEHOLDER] Would list attachments from PDF: {pdf_path}")
    return found
128
+
129
+
130
def attach_files_to_pdf(pdf_path: str, attachments: List[Attachment]) -> str:
    """
    Create a new PDF next to the input that is meant to carry all attachments.

    The embedding itself is not implemented yet: the function logs what it
    would attach and copies the input PDF unchanged to the output path.

    Args:
        pdf_path: Path to the input PDF file
        attachments: List of Attachment objects to add to the PDF

    Returns:
        Path to the output PDF file: the input path with its extension
        replaced by "_with_attachments.pdf".

    Raises:
        OSError: If the input PDF cannot be read or the copy cannot be
            written.

    TODO: Implement the actual PDF attachment logic
    """
    # Only the final extension is rewritten. A plain str.replace(".pdf", ...)
    # would also rewrite ".pdf" inside directory names, and would leave
    # "X.PDF" or extension-less paths untouched, making output == input.
    stem, _extension = os.path.splitext(pdf_path)
    output_path = f"{stem}_with_attachments.pdf"

    # In a real implementation, this would:
    # 1. Load the PDF
    # 2. Add each attachment with its description as metadata
    # 3. Save the modified PDF
    print(f"Would attach {len(attachments)} files to {pdf_path}")
    for att in attachments:
        print(f"  - {att.filename} ({att.size} bytes): {att.description}")

    # Placeholder: copy the original PDF
    import shutil
    shutil.copy2(pdf_path, output_path)

    return output_path
163
+
164
+
165
def validate_file_size(file_path: str) -> Tuple[bool, str]:
    """
    Validate that a file exists and is within the attachment size limit.

    Args:
        file_path: Path of the file to check. May be None or empty.

    Returns:
        Tuple of (is_valid, error_message). error_message is an empty
        string when the file is valid.
    """
    # isfile rather than exists: a directory is not an attachable file and
    # its reported size says nothing about its contents.
    if not file_path or not os.path.isfile(file_path):
        return False, "File does not exist"

    file_size = os.path.getsize(file_path)
    if file_size > MAX_ATTACHMENT_SIZE:
        bytes_per_mb = 1024 * 1024
        size_mb = file_size / bytes_per_mb
        # Derive the limit from the constant so the message cannot drift
        limit_mb = MAX_ATTACHMENT_SIZE / bytes_per_mb
        return False, f"File size ({size_mb:.2f} MB) exceeds {limit_mb:g} MB limit"

    return True, ""
176
+
177
+
178
def process_pdf_with_attachments(
    pdf_file,
    attachments_state: List[Dict]
) -> Tuple[Optional[str], str]:
    """
    Process the PDF file and add attachments to it.

    Args:
        pdf_file: The uploaded PDF, either as a filesystem path (what
            gr.File(type="filepath") passes) or as a file-like object
            exposing the path through its ``name`` attribute
        attachments_state: List of attachment dictionaries (session-specific)

    Returns:
        Tuple of (output_file_path, status_message). output_file_path is
        None whenever validation or processing fails.

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent attachments_state.
    """
    if not pdf_file:
        return None, "Please upload a PDF file"

    if not attachments_state:
        return None, "Please add at least one attachment"

    # Get attachments for this session
    attachments = get_attachments_from_state(attachments_state)

    if not attachments:
        return None, "No valid attachments found"

    # A path-typed upload has no ``.name``; reading it unconditionally turned
    # every request into an "Error processing PDF" message.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    try:
        output_path = attach_files_to_pdf(pdf_path, attachments)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        return None, f"Error processing PDF: {str(e)}"

    return output_path, f"Successfully processed PDF with {len(attachments)} attachment(s)"
213
+
214
+
215
def add_attachment(
    attachment_file,
    description: str,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str, str]:
    """
    Add a new attachment to the session's list.

    Args:
        attachment_file: The file to attach, either as a filesystem path
            (what gr.File(type="filepath") passes) or as a file-like
            object exposing the path through its ``name`` attribute
        description: Description of the attachment
        current_attachments: Current session-specific list of attachments

    Returns:
        Tuple of (updated_attachments_list, attachment_list_html, status_message)

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session.
    """
    if not attachment_file:
        return current_attachments, render_attachments_list(current_attachments), "Please select a file to attach"

    # A path-typed upload has no ``.name``; reading it unconditionally raised
    # AttributeError as soon as a file was selected.
    if isinstance(attachment_file, (str, os.PathLike)):
        attachment_path = os.fspath(attachment_file)
    else:
        attachment_path = attachment_file.name

    # Validate file size
    is_valid, error_msg = validate_file_size(attachment_path)
    if not is_valid:
        return current_attachments, render_attachments_list(current_attachments), f"Error: {error_msg}"

    if not description or description.strip() == "":
        return current_attachments, render_attachments_list(current_attachments), "Please provide a description for the attachment"

    # Create new attachment
    filename = os.path.basename(attachment_path)
    file_size = os.path.getsize(attachment_path)

    new_attachment = {
        'filename': filename,
        'filepath': attachment_path,
        'description': description.strip(),
        'size': file_size
    }

    # Add to session-specific list
    if current_attachments is None:
        current_attachments = []

    current_attachments.append(new_attachment)

    # PLACEHOLDER: to attach to the PDF immediately on add, build an
    # Attachment from new_attachment here and pass it to
    # add_attachment_to_pdf(). That requires the PDF path as an extra
    # parameter of this function.

    return (
        current_attachments,
        render_attachments_list(current_attachments),
        f"Added attachment: {filename}"
    )
282
+
283
+
284
def remove_attachment(
    index: int,
    current_attachments: List[Dict]
) -> Tuple[List[Dict], str]:
    """
    Drop the attachment at ``index`` from the session's list.

    Args:
        index: Index of the attachment to remove
        current_attachments: Current session-specific list of attachments

    Returns:
        Tuple of (updated_attachments_list, attachment_list_html). The list
        is returned unchanged when it is empty/None or the index is out of
        range.

    Note:
        Session isolation is handled by Gradio's State management.
        Each user session has its own independent current_attachments list.
        This function only updates the in-memory state for this session.
    """
    index_is_valid = bool(current_attachments) and 0 <= index < len(current_attachments)
    if index_is_valid:
        current_attachments.pop(index)

        # PLACEHOLDER: to remove from the PDF immediately, build an
        # Attachment from the popped entry here and pass it to
        # remove_attachment_from_pdf(). That requires the PDF path as an
        # extra parameter of this function.

    rendered = render_attachments_list(current_attachments)
    return current_attachments, rendered
322
+
323
+
324
def render_attachments_list(attachments: Optional[List[Dict]]) -> str:
    """
    Render the list of attachments as HTML.

    Args:
        attachments: Attachment dictionaries with 'filename', 'description'
            and 'size' (bytes) keys. May be None or empty.

    Returns:
        An HTML fragment with one card per attachment, or a placeholder
        paragraph when there are no attachments.
    """
    # Function-scope import so the name is available without touching the
    # module's import block.
    from html import escape

    if not attachments:
        return "<p style='color: #666; font-style: italic;'>No attachments added yet</p>"

    cards = []
    for att in attachments:
        size_mb = att['size'] / (1024 * 1024)
        # Filename and description come straight from the user; escape them
        # so they are shown as text instead of being interpreted as markup.
        filename = escape(str(att['filename']))
        description = escape(str(att['description']))
        cards.append(f"""
        <div style='border: 1px solid #ddd; padding: 12px; margin: 8px 0; border-radius: 6px; background: #f9f9f9;'>
            <div style='display: flex; justify-content: space-between; align-items: start;'>
                <div style='flex: 1;'>
                    <strong style='color: #333;'>📎 {filename}</strong>
                    <span style='color: #666; font-size: 0.9em;'> ({size_mb:.2f} MB)</span>
                    <p style='margin: 8px 0 0 0; color: #555;'>{description}</p>
                </div>
            </div>
        </div>
        """)

    return "<div style='font-family: sans-serif;'>" + "".join(cards) + "</div>"
345
+
346
+
347
def main():
    """Create and launch the Gradio interface.

    Builds a Blocks layout with a PDF upload, a list of the attachments
    added so far, a form to add a new attachment, and a button that
    processes the PDF and exposes the result for download. Then launches
    the app.
    """

    with gr.Blocks(title="Incorporate data in PDF") as demo:
        gr.Markdown("""
        # Incorporate Data in PDF

        Upload a PDF file and add attachments with descriptions. The attachments will be embedded in the PDF file.
        """)

        # State to store attachments (starts as an empty list)
        attachments_state = gr.State([])

        # 1. File Upload Section (full width at top)
        gr.Markdown("### 1. Upload PDF File")
        # NOTE(review): with type="filepath" the handler presumably receives
        # a plain path string rather than a file object - confirm against
        # the handlers wired below.
        pdf_input = gr.File(
            label="Select PDF File",
            file_types=[".pdf"],
            type="filepath"
        )

        # 2. Middle section with two columns
        with gr.Row():
            # Left column: List of current attachments
            with gr.Column(scale=1):
                gr.Markdown("### 2. Current Attachments")
                # Initial value is the same "empty" markup that
                # render_attachments_list returns for an empty list
                attachments_display = gr.HTML(
                    value="<p style='color: #666; font-style: italic;'>No attachments added yet</p>"
                )

            # Right column: Form to add new attachments
            with gr.Column(scale=1):
                gr.Markdown("### 3. Add New Attachment")
                gr.Markdown("*Maximum file size: 10 MB per attachment*")

                attachment_file = gr.File(
                    label="Select File to Attach",
                    file_types=None,  # Allow any file type
                    type="filepath"
                )

                attachment_description = gr.Textbox(
                    label="Attachment Description",
                    placeholder="Enter a description for this attachment...",
                    lines=2
                )

                add_btn = gr.Button("Add Attachment", variant="primary")
                add_status = gr.Markdown("")

        # 3. Download Section (full width at bottom)
        gr.Markdown("### 4. Process and Download")

        process_btn = gr.Button("Process PDF with Attachments", variant="primary", size="lg")
        process_status = gr.Markdown("")

        # Hidden until a processing run produces a file (see handler below)
        output_file = gr.File(
            label="Download PDF with Attachments",
            visible=False
        )

        # Event handlers
        # Add button: update state, list and status, then reset the form
        add_btn.click(
            fn=add_attachment,
            inputs=[attachment_file, attachment_description, attachments_state],
            outputs=[attachments_state, attachments_display, add_status]
        ).then(
            lambda: (None, ""),  # Clear the file input and description
            outputs=[attachment_file, attachment_description]
        )

        # Process button: run the processing, then show the download
        # component only when an output file was produced
        process_btn.click(
            fn=process_pdf_with_attachments,
            inputs=[pdf_input, attachments_state],
            outputs=[output_file, process_status]
        ).then(
            lambda x: gr.File(visible=x is not None),
            inputs=[output_file],
            outputs=[output_file]
        )

    demo.launch(theme=gr.themes.Soft())


# Launch the app only when this file is run as a script
if __name__ == "__main__":
    main()
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "pdf-incorporate"
3
+ version = "0.1.0"
4
+ description = "Incorporate data in machine readable format within PDF files"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "gradio[mcp]>=6.2.0",
9
+ "requests>=2.28",
10
+ "python-dotenv>=1.2.1",
11
+ "tiktoken>=0.12.0",
12
+ "parxy",
13
+ ]
14
+
15
+ [tool.uv.sources]
16
+ parxy = { git = "https://github.com/OneOffTech/parxy/", branch = "refactor-pdf-helpers" }
uv.lock ADDED
The diff for this file is too large to render. See raw diff