pragnesh002 commited on
Commit
c71e98d
·
1 Parent(s): c089396

Fix runtime error

Browse files
Files changed (3) hide show
  1. app.py +610 -543
  2. requirements.txt +19 -6
  3. temp_image.py +906 -0
app.py CHANGED
@@ -1,603 +1,670 @@
1
- import gradio as gr
2
- import torch
3
- import fitz # PyMuPDF
4
- import json
5
- import pandas as pd
6
- import os
7
- import re
8
- import xlsxwriter
9
- from PIL import Image
10
- import io
11
- from collections import defaultdict
12
- import numpy as np
13
- import zipfile
14
- from transformers import AutoTokenizer, AutoModelForCausalLM
15
- import warnings
16
- import shutil
17
- import tempfile
18
- import gc
19
- warnings.filterwarnings("ignore")
20
-
21
- # Global variables to store model and tokenizer
22
- MODEL = None
23
- TOKENIZER = None
24
-
25
- def load_model_once():
26
- """Load GGUF model once and keep in memory - Ultra-optimized for CPU"""
27
- global MODEL, TOKENIZER
28
 
29
- if MODEL is not None and TOKENIZER is not None:
30
- print("✅ GGUF Model already loaded in memory")
31
- return MODEL, TOKENIZER
32
 
33
- try:
34
- print("🔄 Loading GGUF model (CPU-optimized)...")
35
 
36
- # GGUF model configurations (choose one)
37
- gguf_models = {
38
- "q4_k_m": "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M", # Recommended
39
- # "q5_k_m": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q5-K-M", # Better quality
40
- # "q8_0": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q8", # High quality
41
- }
42
 
43
- # Use Q4_K_M by default (best balance of speed/quality/size)
44
- model_name = gguf_models["q4_k_m"]
45
 
46
- print(f"Loading GGUF model Change: {model_name}")
47
 
48
- # Load tokenizer
49
- TOKENIZER = AutoTokenizer.from_pretrained(
50
- model_name,
51
- trust_remote_code=True,
52
- use_fast=True # Use fast tokenizer for better performance
53
- )
54
-
55
- print(f"Loaded GGUF TOKENIZER")
56
 
57
- # Load GGUF model with CPU optimizations
58
- MODEL = AutoModelForCausalLM.from_pretrained(
59
- model_name,
60
- device_map="cpu",
61
- trust_remote_code=True,
62
- low_cpu_mem_usage=True,
63
- torch_dtype=torch.float32,
64
- use_cache=True,
65
- cache_dir="/tmp/gguf_cache"
66
- )
67
-
68
- print(f"Loaded GGUF MODEL")
69
 
70
- # Set to evaluation mode
71
- MODEL.eval()
72
 
73
- # CPU optimizations for GGUF
74
- torch.set_num_threads(4) # Optimal for GGUF models
75
- torch.set_num_interop_threads(2)
76
 
77
- print("✅ GGUF Model loaded successfully on CPU!")
78
- print(f"Model type: GGUF Q4_K_M Quantized")
79
- print(f"Memory footprint: ~2.5GB (vs ~8GB for full model)")
80
- print(f"CPU threads: {torch.get_num_threads()}")
81
 
82
- return MODEL, TOKENIZER
83
 
84
- except Exception as e:
85
- print(f"❌ Error loading GGUF model: {e}")
86
- print("Falling back to regular model loading...")
87
 
88
- # Fallback to regular model if GGUF fails
89
- try:
90
- fallback_model = "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M"
91
- TOKENIZER = AutoTokenizer.from_pretrained(fallback_model)
92
- MODEL = AutoModelForCausalLM.from_pretrained(
93
- fallback_model,
94
- device_map="cpu",
95
- torch_dtype=torch.float32,
96
- low_cpu_mem_usage=True
97
- )
98
- print("✅ Fallback model loaded")
99
- return MODEL, TOKENIZER
100
- except:
101
- return None, None
102
-
103
- class ProductImageExtractor:
104
- def __init__(self):
105
- # Create temporary directory for images
106
- self.temp_dir = tempfile.mkdtemp(prefix="pdf_extractor_")
107
- self.image_save_dir = os.path.join(self.temp_dir, "extracted_product_images")
108
- self.model = None
109
- self.tokenizer = None
110
- self.setup_directories()
111
- self.load_model()
112
-
113
- def load_model(self):
114
- """Load the pre-loaded model"""
115
- self.model, self.tokenizer = load_model_once()
116
- if self.model is None:
117
- raise Exception("Failed to load model")
118
-
119
- def setup_directories(self):
120
- """Create necessary directories in temp location"""
121
- os.makedirs(self.image_save_dir, exist_ok=True)
122
- os.makedirs(f"{self.image_save_dir}/product_images", exist_ok=True)
123
- os.makedirs(f"{self.image_save_dir}/non_product_images", exist_ok=True)
124
-
125
- def cleanup_temp_files(self):
126
- """Clean up temporary image files"""
127
- try:
128
- if os.path.exists(self.temp_dir):
129
- shutil.rmtree(self.temp_dir)
130
- print(f"🧹 Cleaned up temporary files: {self.temp_dir}")
131
- except Exception as e:
132
- print(f"Warning: Could not clean up temp files: {e}")
133
-
134
- def generate_text(self, prompt):
135
- """Generate text using the cached model - CPU optimized"""
136
- if self.model is None or self.tokenizer is None:
137
- return "Error: Model not loaded"
138
 
139
- try:
140
- # CPU-optimized tokenization
141
- inputs = self.tokenizer.encode(
142
- prompt,
143
- return_tensors="pt",
144
- truncation=True,
145
- max_length=1024 # Limit input length for CPU
146
- )
147
 
148
- # Generate with CPU-optimized settings
149
- with torch.no_grad():
150
- outputs = self.model.generate(
151
- inputs,
152
- max_new_tokens=512, # Reduced for CPU
153
- temperature=0.1,
154
- do_sample=False, # Greedy decoding for CPU (faster)
155
- pad_token_id=self.tokenizer.eos_token_id,
156
- eos_token_id=self.tokenizer.eos_token_id,
157
- use_cache=True,
158
- num_beams=1, # No beam search (faster on CPU)
159
- )
160
 
161
- # Decode response
162
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
163
 
164
- # Extract only the generated part
165
- prompt_length = len(self.tokenizer.decode(inputs[0], skip_special_tokens=True))
166
- response = response[prompt_length:].strip()
167
 
168
- # Force garbage collection to free memory
169
- del inputs, outputs
170
- gc.collect()
171
 
172
- return response
173
 
174
- except Exception as e:
175
- return f"Error in generation: {e}"
176
-
177
- def is_product_related_image(self, image_bbox, text_blocks, page_text):
178
- """Determine if an image is product-related"""
179
- product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
180
- product_codes = re.findall(product_code_pattern, page_text)
181
-
182
- if not product_codes:
183
- return False, None, 0.0
184
-
185
- product_text_blocks = []
186
- for block in text_blocks:
187
- if len(block) < 5:
188
- continue
189
- block_text = block[4]
190
- if any(code in block_text for code in product_codes):
191
- product_text_blocks.append({
192
- 'bbox': block[:4],
193
- 'text': block_text,
194
- 'codes': [code for code in product_codes if code in block_text]
195
- })
196
-
197
- if not product_text_blocks:
198
- return False, None, 0.0
199
-
200
- max_proximity_score = 0.0
201
- closest_product_code = None
202
-
203
- for block in product_text_blocks:
204
- proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])
205
- if proximity_score > max_proximity_score:
206
- max_proximity_score = proximity_score
207
- closest_product_code = block['codes'][0] if block['codes'] else None
208
-
209
- is_product = self.additional_filters(image_bbox, max_proximity_score)
210
- return is_product, closest_product_code, max_proximity_score
211
-
212
- def additional_filters(self, image_bbox, max_proximity_score):
213
- """Apply additional filters for image classification"""
214
- image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
215
- if image_area < 3000:
216
- return False
217
- page_height = 842
218
- if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
219
- return False
220
- return max_proximity_score > 0.2
221
-
222
- def calculate_proximity_score(self, image_bbox, text_bbox):
223
- """Calculate proximity score between image and text"""
224
- img_center_x = (image_bbox[0] + image_bbox[2]) / 2
225
- img_center_y = (image_bbox[1] + image_bbox[3]) / 2
226
- text_center_x = (text_bbox[0] + text_bbox[2]) / 2
227
- text_center_y = (text_bbox[1] + text_bbox[3]) / 2
228
- distance = ((img_center_x - text_center_x) ** 2 + (img_center_y - text_center_y) ** 2) ** 0.5
229
- proximity_score = max(0, 1 - (distance / 800))
230
- return proximity_score
231
-
232
- def extract_and_classify_images(self, page, page_num, doc):
233
- """Extract and classify images from page"""
234
- images = page.get_images(full=True)
235
- text_blocks = page.get_text("blocks")
236
- page_text = page.get_text()
237
-
238
- product_images = []
239
-
240
- for img_index, img_info in enumerate(images):
241
- xref = img_info[0]
242
- try:
243
- image_list = page.get_image_rects(xref)
244
- if not image_list:
245
- continue
246
-
247
- image_bbox = image_list[0]
248
- is_product, product_code, proximity_score = self.is_product_related_image(
249
- image_bbox, text_blocks, page_text
250
- )
251
-
252
- if is_product and product_code:
253
- pix = fitz.Pixmap(doc, xref)
254
- if pix.n - pix.alpha > 3:
255
- pix = fitz.Pixmap(fitz.csRGB, pix)
256
-
257
- filename = f"page{page_num}_{product_code}_img{img_index+1}.png"
258
- image_path = os.path.join(self.image_save_dir, "product_images", filename)
259
- pix.save(image_path)
260
-
261
- image_data = {
262
- 'path': image_path,
263
- 'product_code': product_code,
264
- 'proximity_score': proximity_score
265
- }
266
- product_images.append(image_data)
267
- pix = None
268
-
269
- except Exception as e:
270
- print(f"Error extracting image {img_index+1}: {e}")
271
-
272
- return product_images
273
-
274
- def extract_product_data_with_images(self, pdf_file):
275
- """Main extraction function with automatic cleanup"""
276
- try:
277
- doc = fitz.open(pdf_file.name)
278
- total_pages = min(doc.page_count, 10) # Limit to 10 pages for CPU
279
- print(f"Processing {total_pages} pages on CPU...")
280
- except Exception as e:
281
- return None, f"Error opening PDF: {e}"
282
-
283
- all_product_images = {}
284
- product_data_tracker = {}
285
-
286
- system_prompt = """You are a data extraction assistant.
287
- Extract the item details from the provided text.
288
- Provide the output as a JSON object, where each object represents an item and has the following keys: 'Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', and 'Product Image'.
289
- If a key's value is not found in the text for an item, provide an empty string "".
290
- If no items are found, return an empty JSON [].
291
- Do not include any extra text or formatting outside the JSON object.
292
- Include rows with unique Product Code values only."""
293
-
294
- for page_num in range(total_pages):
295
- page = doc.load_page(page_num)
296
- page_text = page.get_text()
297
-
298
- if len(page_text.strip()) < 50: # Skip mostly empty pages
299
- continue
300
-
301
- print(f"Processing page {page_num + 1}...")
302
-
303
- # Extract images
304
- product_images = self.extract_and_classify_images(page, page_num + 1, doc)
305
- for img_data in product_images:
306
- if img_data['product_code']:
307
- if img_data['product_code'] not in all_product_images:
308
- all_product_images[img_data['product_code']] = []
309
- all_product_images[img_data['product_code']].append(img_data)
310
-
311
- # Extract text data (CPU-optimized processing)
312
- prompt = f"{system_prompt}\n\nText:\n{page_text[:2000]}\n\nOutput JSON:" # Limit text length
313
- raw_output = self.generate_text(prompt)
314
-
315
- try:
316
- # Parse JSON response
317
- json_start = raw_output.find('[')
318
- json_end = raw_output.rfind(']') + 1
319
 
320
- if json_start != -1 and json_end != 0:
321
- json_str = raw_output[json_start:json_end]
322
- else:
323
- json_str = raw_output.strip()
324
-
325
- parsed_data = json.loads(json_str)
326
- if isinstance(parsed_data, dict):
327
- parsed_data = [parsed_data]
328
- elif not isinstance(parsed_data, list):
329
- parsed_data = []
330
-
331
- for item in parsed_data:
332
- if isinstance(item, dict):
333
- product_code = item.get('Product Code', '').strip()
334
- if not product_code:
335
- continue
336
-
337
- # Find best matching image
338
- image_path = ""
339
- if product_code in all_product_images:
340
- best_image = max(all_product_images[product_code],
341
- key=lambda x: x['proximity_score'])
342
- image_path = best_image['path']
343
-
344
- current_item_data = {
345
- "pdf_page_number": page_num + 1,
346
- "Flag": item.get('Flag', ''),
347
- "Product Code": product_code,
348
- "Description": item.get('Description', ''),
349
- "Manufacturer": item.get('Manufacturer', ''),
350
- "Supplier": item.get('Supplier', ''),
351
- "Material": item.get('Material', ''),
352
- "Dimensions": item.get('Dimensions', ''),
353
- "Product Image": item.get('Product Image', ''),
354
- "Product Image File": image_path,
355
- }
356
-
357
- if product_code not in product_data_tracker:
358
- product_data_tracker[product_code] = current_item_data
359
-
360
- except Exception as e:
361
- print(f"Error processing page {page_num + 1}: {e}")
362
-
363
- doc.close()
364
- final_data = list(product_data_tracker.values())
365
- return final_data, None
366
-
367
- def create_excel_with_images_and_cleanup(data, extractor, output_filename="product_data_with_images.xlsx"):
368
- """Create Excel file with embedded images, then clean up image files"""
369
- if not data:
370
- return None
371
 
372
- df = pd.DataFrame(data)
373
 
374
- try:
375
- with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
376
- df.to_excel(writer, sheet_name='Product Data', index=False)
377
 
378
- workbook = writer.book
379
- worksheet = writer.sheets['Product Data']
380
 
381
- # Set column widths
382
- for col_idx, column_name in enumerate(df.columns):
383
- if column_name == "Product Image":
384
- worksheet.set_column(col_idx, col_idx, 20)
385
- elif column_name in ["Description", "Material"]:
386
- worksheet.set_column(col_idx, col_idx, 30)
387
- else:
388
- worksheet.set_column(col_idx, col_idx, 15)
389
 
390
- # Add header formatting
391
- header_format = workbook.add_format({
392
- 'bold': True,
393
- 'text_wrap': True,
394
- 'valign': 'top',
395
- 'fg_color': '#D7E4BC',
396
- 'border': 1
397
- })
398
 
399
- for col_num, value in enumerate(df.columns.values):
400
- worksheet.write(0, col_num, value, header_format)
401
 
402
- # Embed images
403
- try:
404
- image_col_index = df.columns.get_loc("Product Image")
405
 
406
- for row_num in range(1, len(df) + 1):
407
- image_path = df.iloc[row_num - 1]['Product Image File']
408
 
409
- if image_path and os.path.exists(image_path):
410
- try:
411
- worksheet.set_row(row_num, 80)
412
- worksheet.insert_image(
413
- row_num, image_col_index, image_path,
414
- {'x_scale': 0.2, 'y_scale': 0.2, 'x_offset': 5, 'y_offset': 5}
415
- )
416
- except Exception as e:
417
- print(f"Error embedding image: {e}")
418
 
419
- except KeyError:
420
- pass
421
 
422
- print("✅ Excel file created successfully")
423
 
424
- # Now clean up temporary image files
425
- extractor.cleanup_temp_files()
426
- print("🧹 Temporary image files cleaned up")
427
 
428
- # Also clean up the "Product Image File" column data to show cleanup
429
- df_clean = df.copy()
430
- df_clean['Product Image File'] = df_clean['Product Image File'].apply(
431
- lambda x: "✅ Embedded in Excel (temp files cleaned)" if x else ""
432
- )
433
 
434
- return output_filename, df_clean
435
 
436
- except Exception as e:
437
- print(f"Error creating Excel: {e}")
438
- # Still try to cleanup on error
439
- extractor.cleanup_temp_files()
440
- return None, df
441
-
442
- def process_pdf(pdf_file, progress=gr.Progress()):
443
- """Main processing function with automatic cleanup"""
444
- if pdf_file is None:
445
- return "Please upload a PDF file", None, None
446
 
447
- progress(0.1, desc="Initializing CPU-optimized extractor...")
448
 
449
- extractor = None
450
- try:
451
- extractor = ProductImageExtractor()
452
- except Exception as e:
453
- return f"Error initializing extractor: {e}", None, None
454
 
455
- progress(0.3, desc="Extracting data from PDF (CPU mode - may take 2-3 minutes)...")
456
- extracted_data, error = extractor.extract_product_data_with_images(pdf_file)
457
 
458
- if error:
459
- if extractor:
460
- extractor.cleanup_temp_files()
461
- return f"Error: {error}", None, None
462
 
463
- if not extracted_data:
464
- if extractor:
465
- extractor.cleanup_temp_files()
466
- return "No product data found in the PDF", None, None
467
 
468
- progress(0.7, desc="Creating Excel file and embedding images...")
469
- excel_file, df_clean = create_excel_with_images_and_cleanup(extracted_data, extractor)
470
 
471
- if excel_file is None:
472
- return "Error creating Excel file", pd.DataFrame(extracted_data), None
473
 
474
- progress(0.9, desc="Finalizing and cleaning up...")
475
 
476
- summary = f"""
477
- **✅ Extraction Completed Successfully!**
478
-
479
- **📊 Results:**
480
- - **Total items extracted:** {len(df_clean)}
481
- - **Items with product codes:** {len(df_clean[df_clean['Product Code'] != ''])}
482
- - **Items with images:** {len([x for x in extracted_data if x['Product Image File']])}
483
- - **Unique products:** {len(df_clean[df_clean['Product Code'] != '']['Product Code'].unique()) if len(df_clean[df_clean['Product Code'] != '']) > 0 else 0}
484
-
485
- **💻 CPU Processing:**
486
- - **Mode:** CPU-optimized inference
487
- - **Pages processed:** {df_clean['pdf_page_number'].max() if 'pdf_page_number' in df_clean.columns else 'N/A'}
488
- - **Images:** Embedded in Excel, temporary files cleaned up ✅
489
-
490
- **📥 Ready for Download!**
491
- """
492
 
493
- progress(1.0, desc="Complete!")
494
- return summary, df_clean, excel_file
495
-
496
- # Pre-load the model
497
- print("🚀 Initializing PDF Product Extractor (CPU Mode)...")
498
- print("Loading model into memory...")
499
-
500
- model, tokenizer = load_model_once()
501
- if model is None:
502
- print("❌ Failed to load model during startup")
503
- else:
504
- print("✅ Model successfully loaded and cached on CPU!")
505
-
506
- # Create Gradio interface
507
- with gr.Blocks(
508
- title="PDF Product Data Extractor - CPU Optimized",
509
- theme=gr.themes.Soft(),
510
- ) as demo:
511
 
512
- gr.HTML("""
513
- <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem;">
514
- <h1>📄 PDF Product Data Extractor</h1>
515
- <p>🖥️ CPU-Optimized | 🧹 Auto-Cleanup | 📊 Memory Efficient</p>
516
- </div>
517
- """)
518
 
519
- gr.Markdown("""
520
- ### ⚡ **CPU-Optimized Features:**
521
- - **No GPU Required**: Runs efficiently on CPU-only environments
522
- - **Memory Efficient**: Automatic cleanup of temporary files
523
- - **Cost Effective**: Perfect for free Hugging Face Spaces
524
- - **Smart Processing**: Limited to 10 pages for optimal performance
525
 
526
- ### 🧹 **Automatic Cleanup:**
527
- - Images are temporarily extracted for processing
528
- - Embedded into Excel file during creation
529
- - All temporary image files automatically deleted
530
- - Keeps only the final Excel with embedded images
531
- """)
532
 
533
- with gr.Row():
534
- with gr.Column(scale=1):
535
- pdf_input = gr.File(
536
- label="📁 Upload PDF File",
537
- file_types=[".pdf"],
538
- file_count="single",
539
- height=120
540
- )
541
 
542
- extract_btn = gr.Button(
543
- "🔍 Extract Product Data (CPU Mode)",
544
- variant="primary",
545
- size="lg"
546
- )
547
 
548
- gr.Markdown("""
549
- **💡 CPU Mode Notes:**
550
- - Processing takes 2-3 minutes (vs 30 seconds on GPU)
551
- - Limited to 10 pages per PDF
552
- - Uses 4 CPU threads for stability
553
- - Temporary files auto-cleaned after Excel creation
554
- """)
555
 
556
- with gr.Column(scale=2):
557
- status_output = gr.Markdown(
558
- value="🖥️ CPU mode ready. Upload your PDF to begin processing..."
559
- )
560
 
561
- with gr.Row():
562
- with gr.Column():
563
- data_output = gr.Dataframe(
564
- label="📋 Extracted Product Data",
565
- wrap=True,
566
- interactive=False
567
- )
568
 
569
- with gr.Column():
570
- excel_output = gr.File(
571
- label="📥 Download Excel File",
572
- file_count="single"
573
- )
574
 
575
- extract_btn.click(
576
- fn=process_pdf,
577
- inputs=[pdf_input],
578
- outputs=[status_output, data_output, excel_output],
579
- show_progress=True
580
- )
581
 
582
- gr.Markdown("""
583
- ---
584
- **🔧 Technical Details:**
585
- - **Model**: Fine-tuned Qwen3-4B (CPU-optimized)
586
- - **Processing**: torch.float32, greedy decoding
587
- - **Memory**: Auto garbage collection, temp file cleanup
588
- - **Threads**: Limited to 4 CPU threads for stability
589
 
590
- **🧹 Cleanup Process:**
591
- 1. Images extracted to temporary directory
592
- 2. Data processed and Excel created with embedded images
593
- 3. Temporary image files automatically deleted
594
- 4. Only final Excel file retained with embedded images
595
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  if __name__ == "__main__":
598
- demo.launch(
599
- server_name="0.0.0.0",
600
- server_port=7860,
601
- share=False,
602
- show_error=True
603
- )
 
1
+ # import gradio as gr
2
+ # import torch
3
+ # import fitz # PyMuPDF
4
+ # import json
5
+ # import pandas as pd
6
+ # import os
7
+ # import re
8
+ # import xlsxwriter
9
+ # from PIL import Image
10
+ # import io
11
+ # from collections import defaultdict
12
+ # import numpy as np
13
+ # import zipfile
14
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
15
+ # import warnings
16
+ # import shutil
17
+ # import tempfile
18
+ # import gc
19
+ # warnings.filterwarnings("ignore")
20
+
21
+ # # Global variables to store model and tokenizer
22
+ # MODEL = None
23
+ # TOKENIZER = None
24
+
25
+ # def load_model_once():
26
+ # """Load GGUF model once and keep in memory - Ultra-optimized for CPU"""
27
+ # global MODEL, TOKENIZER
28
 
29
+ # if MODEL is not None and TOKENIZER is not None:
30
+ # print("✅ GGUF Model already loaded in memory")
31
+ # return MODEL, TOKENIZER
32
 
33
+ # try:
34
+ # print("🔄 Loading GGUF model (CPU-optimized)...")
35
 
36
+ # # GGUF model configurations (choose one)
37
+ # gguf_models = {
38
+ # "q4_k_m": "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M", # Recommended
39
+ # # "q5_k_m": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q5-K-M", # Better quality
40
+ # # "q8_0": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q8", # High quality
41
+ # }
42
 
43
+ # # Use Q4_K_M by default (best balance of speed/quality/size)
44
+ # model_name = gguf_models["q4_k_m"]
45
 
46
+ # print(f"Loading GGUF model Change: {model_name}")
47
 
48
+ # # Load tokenizer
49
+ # TOKENIZER = AutoTokenizer.from_pretrained(
50
+ # model_name,
51
+ # trust_remote_code=True,
52
+ # use_fast=True # Use fast tokenizer for better performance
53
+ # )
54
+
55
+ # print(f"Loaded GGUF TOKENIZER")
56
 
57
+ # # Load GGUF model with CPU optimizations
58
+ # MODEL = AutoModelForCausalLM.from_pretrained(
59
+ # model_name,
60
+ # device_map="cpu",
61
+ # trust_remote_code=True,
62
+ # low_cpu_mem_usage=True,
63
+ # torch_dtype=torch.float32,
64
+ # use_cache=True,
65
+ # cache_dir="/tmp/gguf_cache"
66
+ # )
67
+
68
+ # print(f"Loaded GGUF MODEL")
69
 
70
+ # # Set to evaluation mode
71
+ # MODEL.eval()
72
 
73
+ # # CPU optimizations for GGUF
74
+ # torch.set_num_threads(4) # Optimal for GGUF models
75
+ # torch.set_num_interop_threads(2)
76
 
77
+ # print("✅ GGUF Model loaded successfully on CPU!")
78
+ # print(f"Model type: GGUF Q4_K_M Quantized")
79
+ # print(f"Memory footprint: ~2.5GB (vs ~8GB for full model)")
80
+ # print(f"CPU threads: {torch.get_num_threads()}")
81
 
82
+ # return MODEL, TOKENIZER
83
 
84
+ # except Exception as e:
85
+ # print(f"❌ Error loading GGUF model: {e}")
86
+ # print("Falling back to regular model loading...")
87
 
88
+ # # Fallback to regular model if GGUF fails
89
+ # try:
90
+ # fallback_model = "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M"
91
+ # TOKENIZER = AutoTokenizer.from_pretrained(fallback_model)
92
+ # MODEL = AutoModelForCausalLM.from_pretrained(
93
+ # fallback_model,
94
+ # device_map="cpu",
95
+ # torch_dtype=torch.float32,
96
+ # low_cpu_mem_usage=True
97
+ # )
98
+ # print("✅ Fallback model loaded")
99
+ # return MODEL, TOKENIZER
100
+ # except:
101
+ # return None, None
102
+
103
+ # class ProductImageExtractor:
104
+ # def __init__(self):
105
+ # # Create temporary directory for images
106
+ # self.temp_dir = tempfile.mkdtemp(prefix="pdf_extractor_")
107
+ # self.image_save_dir = os.path.join(self.temp_dir, "extracted_product_images")
108
+ # self.model = None
109
+ # self.tokenizer = None
110
+ # self.setup_directories()
111
+ # self.load_model()
112
+
113
+ # def load_model(self):
114
+ # """Load the pre-loaded model"""
115
+ # self.model, self.tokenizer = load_model_once()
116
+ # if self.model is None:
117
+ # raise Exception("Failed to load model")
118
+
119
+ # def setup_directories(self):
120
+ # """Create necessary directories in temp location"""
121
+ # os.makedirs(self.image_save_dir, exist_ok=True)
122
+ # os.makedirs(f"{self.image_save_dir}/product_images", exist_ok=True)
123
+ # os.makedirs(f"{self.image_save_dir}/non_product_images", exist_ok=True)
124
+
125
+ # def cleanup_temp_files(self):
126
+ # """Clean up temporary image files"""
127
+ # try:
128
+ # if os.path.exists(self.temp_dir):
129
+ # shutil.rmtree(self.temp_dir)
130
+ # print(f"🧹 Cleaned up temporary files: {self.temp_dir}")
131
+ # except Exception as e:
132
+ # print(f"Warning: Could not clean up temp files: {e}")
133
+
134
+ # def generate_text(self, prompt):
135
+ # """Generate text using the cached model - CPU optimized"""
136
+ # if self.model is None or self.tokenizer is None:
137
+ # return "Error: Model not loaded"
138
 
139
+ # try:
140
+ # # CPU-optimized tokenization
141
+ # inputs = self.tokenizer.encode(
142
+ # prompt,
143
+ # return_tensors="pt",
144
+ # truncation=True,
145
+ # max_length=1024 # Limit input length for CPU
146
+ # )
147
 
148
+ # # Generate with CPU-optimized settings
149
+ # with torch.no_grad():
150
+ # outputs = self.model.generate(
151
+ # inputs,
152
+ # max_new_tokens=512, # Reduced for CPU
153
+ # temperature=0.1,
154
+ # do_sample=False, # Greedy decoding for CPU (faster)
155
+ # pad_token_id=self.tokenizer.eos_token_id,
156
+ # eos_token_id=self.tokenizer.eos_token_id,
157
+ # use_cache=True,
158
+ # num_beams=1, # No beam search (faster on CPU)
159
+ # )
160
 
161
+ # # Decode response
162
+ # response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
163
 
164
+ # # Extract only the generated part
165
+ # prompt_length = len(self.tokenizer.decode(inputs[0], skip_special_tokens=True))
166
+ # response = response[prompt_length:].strip()
167
 
168
+ # # Force garbage collection to free memory
169
+ # del inputs, outputs
170
+ # gc.collect()
171
 
172
+ # return response
173
 
174
+ # except Exception as e:
175
+ # return f"Error in generation: {e}"
176
+
177
+ # def is_product_related_image(self, image_bbox, text_blocks, page_text):
178
+ # """Determine if an image is product-related"""
179
+ # product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
180
+ # product_codes = re.findall(product_code_pattern, page_text)
181
+
182
+ # if not product_codes:
183
+ # return False, None, 0.0
184
+
185
+ # product_text_blocks = []
186
+ # for block in text_blocks:
187
+ # if len(block) < 5:
188
+ # continue
189
+ # block_text = block[4]
190
+ # if any(code in block_text for code in product_codes):
191
+ # product_text_blocks.append({
192
+ # 'bbox': block[:4],
193
+ # 'text': block_text,
194
+ # 'codes': [code for code in product_codes if code in block_text]
195
+ # })
196
+
197
+ # if not product_text_blocks:
198
+ # return False, None, 0.0
199
+
200
+ # max_proximity_score = 0.0
201
+ # closest_product_code = None
202
+
203
+ # for block in product_text_blocks:
204
+ # proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])
205
+ # if proximity_score > max_proximity_score:
206
+ # max_proximity_score = proximity_score
207
+ # closest_product_code = block['codes'][0] if block['codes'] else None
208
+
209
+ # is_product = self.additional_filters(image_bbox, max_proximity_score)
210
+ # return is_product, closest_product_code, max_proximity_score
211
+
212
+ # def additional_filters(self, image_bbox, max_proximity_score):
213
+ # """Apply additional filters for image classification"""
214
+ # image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
215
+ # if image_area < 3000:
216
+ # return False
217
+ # page_height = 842
218
+ # if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
219
+ # return False
220
+ # return max_proximity_score > 0.2
221
+
222
+ # def calculate_proximity_score(self, image_bbox, text_bbox):
223
+ # """Calculate proximity score between image and text"""
224
+ # img_center_x = (image_bbox[0] + image_bbox[2]) / 2
225
+ # img_center_y = (image_bbox[1] + image_bbox[3]) / 2
226
+ # text_center_x = (text_bbox[0] + text_bbox[2]) / 2
227
+ # text_center_y = (text_bbox[1] + text_bbox[3]) / 2
228
+ # distance = ((img_center_x - text_center_x) ** 2 + (img_center_y - text_center_y) ** 2) ** 0.5
229
+ # proximity_score = max(0, 1 - (distance / 800))
230
+ # return proximity_score
231
+
232
+ # def extract_and_classify_images(self, page, page_num, doc):
233
+ # """Extract and classify images from page"""
234
+ # images = page.get_images(full=True)
235
+ # text_blocks = page.get_text("blocks")
236
+ # page_text = page.get_text()
237
+
238
+ # product_images = []
239
+
240
+ # for img_index, img_info in enumerate(images):
241
+ # xref = img_info[0]
242
+ # try:
243
+ # image_list = page.get_image_rects(xref)
244
+ # if not image_list:
245
+ # continue
246
+
247
+ # image_bbox = image_list[0]
248
+ # is_product, product_code, proximity_score = self.is_product_related_image(
249
+ # image_bbox, text_blocks, page_text
250
+ # )
251
+
252
+ # if is_product and product_code:
253
+ # pix = fitz.Pixmap(doc, xref)
254
+ # if pix.n - pix.alpha > 3:
255
+ # pix = fitz.Pixmap(fitz.csRGB, pix)
256
+
257
+ # filename = f"page{page_num}_{product_code}_img{img_index+1}.png"
258
+ # image_path = os.path.join(self.image_save_dir, "product_images", filename)
259
+ # pix.save(image_path)
260
+
261
+ # image_data = {
262
+ # 'path': image_path,
263
+ # 'product_code': product_code,
264
+ # 'proximity_score': proximity_score
265
+ # }
266
+ # product_images.append(image_data)
267
+ # pix = None
268
+
269
+ # except Exception as e:
270
+ # print(f"Error extracting image {img_index+1}: {e}")
271
+
272
+ # return product_images
273
+
274
+ # def extract_product_data_with_images(self, pdf_file):
275
+ # """Main extraction function with automatic cleanup"""
276
+ # try:
277
+ # doc = fitz.open(pdf_file.name)
278
+ # total_pages = min(doc.page_count, 10) # Limit to 10 pages for CPU
279
+ # print(f"Processing {total_pages} pages on CPU...")
280
+ # except Exception as e:
281
+ # return None, f"Error opening PDF: {e}"
282
+
283
+ # all_product_images = {}
284
+ # product_data_tracker = {}
285
+
286
+ # system_prompt = """You are a data extraction assistant.
287
+ # Extract the item details from the provided text.
288
+ # Provide the output as a JSON object, where each object represents an item and has the following keys: 'Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', and 'Product Image'.
289
+ # If a key's value is not found in the text for an item, provide an empty string "".
290
+ # If no items are found, return an empty JSON [].
291
+ # Do not include any extra text or formatting outside the JSON object.
292
+ # Include rows with unique Product Code values only."""
293
+
294
+ # for page_num in range(total_pages):
295
+ # page = doc.load_page(page_num)
296
+ # page_text = page.get_text()
297
+
298
+ # if len(page_text.strip()) < 50: # Skip mostly empty pages
299
+ # continue
300
+
301
+ # print(f"Processing page {page_num + 1}...")
302
+
303
+ # # Extract images
304
+ # product_images = self.extract_and_classify_images(page, page_num + 1, doc)
305
+ # for img_data in product_images:
306
+ # if img_data['product_code']:
307
+ # if img_data['product_code'] not in all_product_images:
308
+ # all_product_images[img_data['product_code']] = []
309
+ # all_product_images[img_data['product_code']].append(img_data)
310
+
311
+ # # Extract text data (CPU-optimized processing)
312
+ # prompt = f"{system_prompt}\n\nText:\n{page_text[:2000]}\n\nOutput JSON:" # Limit text length
313
+ # raw_output = self.generate_text(prompt)
314
+
315
+ # try:
316
+ # # Parse JSON response
317
+ # json_start = raw_output.find('[')
318
+ # json_end = raw_output.rfind(']') + 1
319
 
320
+ # if json_start != -1 and json_end != 0:
321
+ # json_str = raw_output[json_start:json_end]
322
+ # else:
323
+ # json_str = raw_output.strip()
324
+
325
+ # parsed_data = json.loads(json_str)
326
+ # if isinstance(parsed_data, dict):
327
+ # parsed_data = [parsed_data]
328
+ # elif not isinstance(parsed_data, list):
329
+ # parsed_data = []
330
+
331
+ # for item in parsed_data:
332
+ # if isinstance(item, dict):
333
+ # product_code = item.get('Product Code', '').strip()
334
+ # if not product_code:
335
+ # continue
336
+
337
+ # # Find best matching image
338
+ # image_path = ""
339
+ # if product_code in all_product_images:
340
+ # best_image = max(all_product_images[product_code],
341
+ # key=lambda x: x['proximity_score'])
342
+ # image_path = best_image['path']
343
+
344
+ # current_item_data = {
345
+ # "pdf_page_number": page_num + 1,
346
+ # "Flag": item.get('Flag', ''),
347
+ # "Product Code": product_code,
348
+ # "Description": item.get('Description', ''),
349
+ # "Manufacturer": item.get('Manufacturer', ''),
350
+ # "Supplier": item.get('Supplier', ''),
351
+ # "Material": item.get('Material', ''),
352
+ # "Dimensions": item.get('Dimensions', ''),
353
+ # "Product Image": item.get('Product Image', ''),
354
+ # "Product Image File": image_path,
355
+ # }
356
+
357
+ # if product_code not in product_data_tracker:
358
+ # product_data_tracker[product_code] = current_item_data
359
+
360
+ # except Exception as e:
361
+ # print(f"Error processing page {page_num + 1}: {e}")
362
+
363
+ # doc.close()
364
+ # final_data = list(product_data_tracker.values())
365
+ # return final_data, None
366
+
367
+ # def create_excel_with_images_and_cleanup(data, extractor, output_filename="product_data_with_images.xlsx"):
368
+ # """Create Excel file with embedded images, then clean up image files"""
369
+ # if not data:
370
+ # return None
371
 
372
+ # df = pd.DataFrame(data)
373
 
374
+ # try:
375
+ # with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
376
+ # df.to_excel(writer, sheet_name='Product Data', index=False)
377
 
378
+ # workbook = writer.book
379
+ # worksheet = writer.sheets['Product Data']
380
 
381
+ # # Set column widths
382
+ # for col_idx, column_name in enumerate(df.columns):
383
+ # if column_name == "Product Image":
384
+ # worksheet.set_column(col_idx, col_idx, 20)
385
+ # elif column_name in ["Description", "Material"]:
386
+ # worksheet.set_column(col_idx, col_idx, 30)
387
+ # else:
388
+ # worksheet.set_column(col_idx, col_idx, 15)
389
 
390
+ # # Add header formatting
391
+ # header_format = workbook.add_format({
392
+ # 'bold': True,
393
+ # 'text_wrap': True,
394
+ # 'valign': 'top',
395
+ # 'fg_color': '#D7E4BC',
396
+ # 'border': 1
397
+ # })
398
 
399
+ # for col_num, value in enumerate(df.columns.values):
400
+ # worksheet.write(0, col_num, value, header_format)
401
 
402
+ # # Embed images
403
+ # try:
404
+ # image_col_index = df.columns.get_loc("Product Image")
405
 
406
+ # for row_num in range(1, len(df) + 1):
407
+ # image_path = df.iloc[row_num - 1]['Product Image File']
408
 
409
+ # if image_path and os.path.exists(image_path):
410
+ # try:
411
+ # worksheet.set_row(row_num, 80)
412
+ # worksheet.insert_image(
413
+ # row_num, image_col_index, image_path,
414
+ # {'x_scale': 0.2, 'y_scale': 0.2, 'x_offset': 5, 'y_offset': 5}
415
+ # )
416
+ # except Exception as e:
417
+ # print(f"Error embedding image: {e}")
418
 
419
+ # except KeyError:
420
+ # pass
421
 
422
+ # print("✅ Excel file created successfully")
423
 
424
+ # # Now clean up temporary image files
425
+ # extractor.cleanup_temp_files()
426
+ # print("🧹 Temporary image files cleaned up")
427
 
428
+ # # Also clean up the "Product Image File" column data to show cleanup
429
+ # df_clean = df.copy()
430
+ # df_clean['Product Image File'] = df_clean['Product Image File'].apply(
431
+ # lambda x: "✅ Embedded in Excel (temp files cleaned)" if x else ""
432
+ # )
433
 
434
+ # return output_filename, df_clean
435
 
436
+ # except Exception as e:
437
+ # print(f"Error creating Excel: {e}")
438
+ # # Still try to cleanup on error
439
+ # extractor.cleanup_temp_files()
440
+ # return None, df
441
+
442
+ # def process_pdf(pdf_file, progress=gr.Progress()):
443
+ # """Main processing function with automatic cleanup"""
444
+ # if pdf_file is None:
445
+ # return "Please upload a PDF file", None, None
446
 
447
+ # progress(0.1, desc="Initializing CPU-optimized extractor...")
448
 
449
+ # extractor = None
450
+ # try:
451
+ # extractor = ProductImageExtractor()
452
+ # except Exception as e:
453
+ # return f"Error initializing extractor: {e}", None, None
454
 
455
+ # progress(0.3, desc="Extracting data from PDF (CPU mode - may take 2-3 minutes)...")
456
+ # extracted_data, error = extractor.extract_product_data_with_images(pdf_file)
457
 
458
+ # if error:
459
+ # if extractor:
460
+ # extractor.cleanup_temp_files()
461
+ # return f"Error: {error}", None, None
462
 
463
+ # if not extracted_data:
464
+ # if extractor:
465
+ # extractor.cleanup_temp_files()
466
+ # return "No product data found in the PDF", None, None
467
 
468
+ # progress(0.7, desc="Creating Excel file and embedding images...")
469
+ # excel_file, df_clean = create_excel_with_images_and_cleanup(extracted_data, extractor)
470
 
471
+ # if excel_file is None:
472
+ # return "Error creating Excel file", pd.DataFrame(extracted_data), None
473
 
474
+ # progress(0.9, desc="Finalizing and cleaning up...")
475
 
476
+ # summary = f"""
477
+ # **✅ Extraction Completed Successfully!**
478
+
479
+ # **📊 Results:**
480
+ # - **Total items extracted:** {len(df_clean)}
481
+ # - **Items with product codes:** {len(df_clean[df_clean['Product Code'] != ''])}
482
+ # - **Items with images:** {len([x for x in extracted_data if x['Product Image File']])}
483
+ # - **Unique products:** {len(df_clean[df_clean['Product Code'] != '']['Product Code'].unique()) if len(df_clean[df_clean['Product Code'] != '']) > 0 else 0}
484
+
485
+ # **💻 CPU Processing:**
486
+ # - **Mode:** CPU-optimized inference
487
+ # - **Pages processed:** {df_clean['pdf_page_number'].max() if 'pdf_page_number' in df_clean.columns else 'N/A'}
488
+ # - **Images:** Embedded in Excel, temporary files cleaned up ✅
489
+
490
+ # **📥 Ready for Download!**
491
+ # """
492
 
493
+ # progress(1.0, desc="Complete!")
494
+ # return summary, df_clean, excel_file
495
+
496
+ # # Pre-load the model
497
+ # print("🚀 Initializing PDF Product Extractor (CPU Mode)...")
498
+ # print("Loading model into memory...")
499
+
500
+ # model, tokenizer = load_model_once()
501
+ # if model is None:
502
+ # print("❌ Failed to load model during startup")
503
+ # else:
504
+ # print("✅ Model successfully loaded and cached on CPU!")
505
+
506
+ # # Create Gradio interface
507
+ # with gr.Blocks(
508
+ # title="PDF Product Data Extractor - CPU Optimized",
509
+ # theme=gr.themes.Soft(),
510
+ # ) as demo:
511
 
512
+ # gr.HTML("""
513
+ # <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem;">
514
+ # <h1>📄 PDF Product Data Extractor</h1>
515
+ # <p>🖥️ CPU-Optimized | 🧹 Auto-Cleanup | 📊 Memory Efficient</p>
516
+ # </div>
517
+ # """)
518
 
519
+ # gr.Markdown("""
520
+ # ### ⚡ **CPU-Optimized Features:**
521
+ # - **No GPU Required**: Runs efficiently on CPU-only environments
522
+ # - **Memory Efficient**: Automatic cleanup of temporary files
523
+ # - **Cost Effective**: Perfect for free Hugging Face Spaces
524
+ # - **Smart Processing**: Limited to 10 pages for optimal performance
525
 
526
+ # ### 🧹 **Automatic Cleanup:**
527
+ # - Images are temporarily extracted for processing
528
+ # - Embedded into Excel file during creation
529
+ # - All temporary image files automatically deleted
530
+ # - Keeps only the final Excel with embedded images
531
+ # """)
532
 
533
+ # with gr.Row():
534
+ # with gr.Column(scale=1):
535
+ # pdf_input = gr.File(
536
+ # label="📁 Upload PDF File",
537
+ # file_types=[".pdf"],
538
+ # file_count="single",
539
+ # height=120
540
+ # )
541
 
542
+ # extract_btn = gr.Button(
543
+ # "🔍 Extract Product Data (CPU Mode)",
544
+ # variant="primary",
545
+ # size="lg"
546
+ # )
547
 
548
+ # gr.Markdown("""
549
+ # **💡 CPU Mode Notes:**
550
+ # - Processing takes 2-3 minutes (vs 30 seconds on GPU)
551
+ # - Limited to 10 pages per PDF
552
+ # - Uses 4 CPU threads for stability
553
+ # - Temporary files auto-cleaned after Excel creation
554
+ # """)
555
 
556
+ # with gr.Column(scale=2):
557
+ # status_output = gr.Markdown(
558
+ # value="🖥️ CPU mode ready. Upload your PDF to begin processing..."
559
+ # )
560
 
561
+ # with gr.Row():
562
+ # with gr.Column():
563
+ # data_output = gr.Dataframe(
564
+ # label="📋 Extracted Product Data",
565
+ # wrap=True,
566
+ # interactive=False
567
+ # )
568
 
569
+ # with gr.Column():
570
+ # excel_output = gr.File(
571
+ # label="📥 Download Excel File",
572
+ # file_count="single"
573
+ # )
574
 
575
+ # extract_btn.click(
576
+ # fn=process_pdf,
577
+ # inputs=[pdf_input],
578
+ # outputs=[status_output, data_output, excel_output],
579
+ # show_progress=True
580
+ # )
581
 
582
+ # gr.Markdown("""
583
+ # ---
584
+ # **🔧 Technical Details:**
585
+ # - **Model**: Fine-tuned Qwen3-4B (CPU-optimized)
586
+ # - **Processing**: torch.float32, greedy decoding
587
+ # - **Memory**: Auto garbage collection, temp file cleanup
588
+ # - **Threads**: Limited to 4 CPU threads for stability
589
 
590
+ # **🧹 Cleanup Process:**
591
+ # 1. Images extracted to temporary directory
592
+ # 2. Data processed and Excel created with embedded images
593
+ # 3. Temporary image files automatically deleted
594
+ # 4. Only final Excel file retained with embedded images
595
+ # """)
596
+
597
+ # if __name__ == "__main__":
598
+ # demo.launch(
599
+ # server_name="0.0.0.0",
600
+ # server_port=7860,
601
+ # share=False,
602
+ # show_error=True
603
+ # )
604
+
605
+ import gradio as gr
606
+ import os
607
+ import shutil
608
+ import pandas as pd
609
+
610
+ from unsloth import FastLanguageModel
611
+ from temp_image import ProductImageExtractor, create_excel_with_embedded_images
612
+
613
# -------------------------------
# Load model once at startup
# -------------------------------
# Runs at import time so every Gradio request reuses the same weights.
print("🚀 Loading fine-tuned model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M",
    max_seq_length=2048,   # max prompt + completion length in tokens
    load_in_4bit=True,     # 4-bit quantised weights to fit small GPUs
    fast_inference=True,   # NOTE(review): enables vLLM backend — confirm the host has a GPU and vLLM installed
)
print("✅ Model loaded successfully!")
624
+
625
+
626
+ # -------------------------------
627
+ # PDF → Excel processing function
628
+ # -------------------------------
629
def process_pdf(pdf_file):
    """Run the PDF -> Excel extraction pipeline for one uploaded file.

    Args:
        pdf_file: Path string (what ``gr.File`` delivers with
            ``type="filepath"`` in Gradio 4/5) or a file-like object
            exposing ``.name`` (older Gradio wrapped uploads in a
            tempfile object).

    Returns:
        Tuple of (status message, path to the Excel file or None).
    """
    # Normalise the upload to a plain path: gr.File's payload type changed
    # across Gradio versions, so accept both shapes. The original code
    # assumed an object with `.name`, which crashes on a str payload.
    if isinstance(pdf_file, str):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", None)

    if not pdf_path or not pdf_path.lower().endswith(".pdf"):
        return "❌ Please upload a valid PDF file.", None

    extractor = ProductImageExtractor(pdf_path, model, tokenizer)

    # Extract structured product data (plus matched product images).
    extracted_data = extractor.extract_product_data_with_images()

    if not extracted_data:
        return "⚠️ No product data extracted.", None

    # Generate the Excel workbook with the images embedded in it.
    output_excel = "product_data.xlsx"
    create_excel_with_embedded_images(extracted_data, output_excel)

    # Images now live inside the workbook; drop the temporary folder.
    if os.path.exists(extractor.image_save_dir):
        shutil.rmtree(extractor.image_save_dir, ignore_errors=True)

    return f"✅ Extraction complete. {len(extracted_data)} products found.", output_excel
651
+
652
+
653
# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 📑 PDF → Excel Product Extractor (Qwen3-4B Fine-tuned)")
    gr.Markdown("Upload a PDF → extract structured product data into Excel → auto-remove images after generation.")

    with gr.Row():
        # Gradio 4/5 removed type="file": gr.File accepts only "filepath"
        # (str path) or "binary" (bytes). type="file" raised ValueError at
        # construction, so the app crashed before it could serve requests.
        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        run_btn = gr.Button("Extract to Excel")

    status = gr.Textbox(label="Status", interactive=False)
    excel_output = gr.File(label="Download Excel")

    run_btn.click(process_pdf, inputs=pdf_input, outputs=[status, excel_output])

if __name__ == "__main__":
    # Bind all interfaces on port 7860 so the HF Spaces proxy can reach us.
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
requirements.txt CHANGED
@@ -1,13 +1,26 @@
1
  gradio==5.18
2
  torch
3
- transformers
 
 
 
 
 
 
 
4
  PyMuPDF
5
  pandas
6
  xlsxwriter
7
  Pillow
8
  numpy
9
- accelerate
10
- sentencepiece
11
- huggingface_hub
12
- psutil
13
- websockets
 
 
 
 
 
 
 
1
  gradio==5.18
2
  torch
3
+ transformers==4.55.4
4
+ accelerate
5
+ sentencepiece
6
+ huggingface_hub
7
+ psutil
8
+ websockets
9
+
10
+ # PDF & Excel handling
11
  PyMuPDF
12
  pandas
13
  xlsxwriter
14
  Pillow
15
  numpy
16
+
17
+ # Unsloth & related
18
+ unsloth
19
+ vllm==0.10.1
20
+ triton==3.2.0
21
+ bitsandbytes
22
+ xformers
23
+
24
+ # Training helpers
25
+ trl
26
+ datasets
temp_image.py ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """temp_image.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1g_LpdbYLQ7dGmAzUiG2X2gPQUsPkDN1D
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%capture
12
+ # import os
13
+ # os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
14
+ #
15
+ # # Install packages for Colab
16
+ # !pip install --upgrade -qqq uv
17
+ # try:
18
+ # import numpy; get_numpy = f"numpy=={numpy.__version__}"
19
+ # except:
20
+ # get_numpy = "numpy"
21
+ #
22
+ # try:
23
+ # import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
24
+ # except:
25
+ # is_t4 = False
26
+ #
27
+ # get_vllm, get_triton = ("vllm==0.10.1", "triton==3.2.0") if is_t4 else ("vllm", "triton")
28
+ #
29
+ # !uv pip install -qqq --upgrade \
30
+ # unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
31
+ # !uv pip install -qqq {get_triton}
32
+ # !uv pip install transformers==4.55.4
33
+ # !uv pip install PyMuPDF xlsxwriter pillow
34
+ #
35
+ # print("All packages installed successfully!")
36
+
37
+ from unsloth import FastLanguageModel
38
+ import torch
39
+ import fitz # PyMuPDF
40
+ import json
41
+ import pandas as pd
42
+ import os
43
+ import re
44
+ import xlsxwriter
45
+ from PIL import Image, ImageDraw
46
+ import io
47
+ from collections import defaultdict
48
+ from vllm import SamplingParams
49
+ from trl import GRPOConfig, GRPOTrainer
50
+ from datasets import Dataset
51
+ import numpy as np
52
+ from google.colab import files
53
+ import zipfile
54
+ import matplotlib.pyplot as plt
55
+
56
+ # Model configuration
57
+ max_seq_length = 2048
58
+ lora_rank = 32
59
+
60
+ print("Loading model...")
61
+ model, tokenizer = FastLanguageModel.from_pretrained(
62
+ model_name="unsloth/Qwen3-4B-Base",
63
+ max_seq_length=max_seq_length,
64
+ load_in_4bit=False,
65
+ fast_inference=True,
66
+ max_lora_rank=lora_rank,
67
+ gpu_memory_utilization=0.7,
68
+ )
69
+
70
+ model = FastLanguageModel.get_peft_model(
71
+ model,
72
+ r=lora_rank,
73
+ target_modules=[
74
+ "q_proj", "k_proj", "v_proj", "o_proj",
75
+ "gate_proj", "up_proj", "down_proj",
76
+ ],
77
+ lora_alpha=lora_rank*2,
78
+ use_gradient_checkpointing="unsloth",
79
+ random_state=3407,
80
+ )
81
+
82
+ print("Model loaded successfully!")
83
+
84
+ print("Please upload your PDF file:")
85
+ uploaded = files.upload()
86
+
87
+ # Get the uploaded file name
88
+ pdf_file_path = list(uploaded.keys())[0]
89
+ print(f"Uploaded file: {pdf_file_path}")
90
+
91
+ # Verify the file
92
+ if not pdf_file_path.endswith('.pdf'):
93
+ print("Warning: Please ensure you uploaded a PDF file")
94
+ else:
95
+ print("PDF file ready for processing!")
96
+
97
+ new_system_prompt = """You are a data extraction assistant.
98
+ Extract the item details from the provided text.
99
+ Provide the output as a JSON object, where object represents an item and has the following keys: 'Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', and 'Product Image'.
100
+ If a key's value is not found in the text for an item, provide an empty string "".
101
+ If no items are found, return an empty JSON {}.
102
+ Do not include any extra text or formatting outside the JSON object.
103
+ Include rows with unique Product Code values only.
104
+ For the 'Dimensions' field, extract all dimension information found (e.g., Height, Width, Depth, Diameter, Length) and format them as a single string of key-value pairs separated by semicolons, like "Height: [value]; Width: [value]; Diameter: [value]". If a specific dimension is not available, do not include its key-value pair in the string.
105
+ If we found the data from first page then take those only If there are any missing details or extra details then include with it.
106
+ Do not include any duplicate data in any key of JSON."""
107
+
108
+ # Your existing training data
109
+ annotated_data_examples = [
110
+ {
111
+ "prompt": [
112
+ {"role": "system", "content": new_system_prompt},
113
+ {"role": "user", "content": "Text:\nProject Name: Anse La Mouche\nItem Number: GR-AA10\nDescription: Wall Hanging Art Work\nManufacturer: Harper + Wilde\nSupplier: Harper + Wilde\nMaterial/Finish: Hand Rolled Clay Beads, Cuttlefish Bone, Hemp Rope\nDimensions: Height: 300mm; Width: 250mm\nImage:\n[Image Placeholder]\n\nOutput JSON:"},
114
+ ],
115
+ "answer": '[{"Flag": "", "Product Code": "GR-AA10", "Description": "Wall Hanging Art Work", "Manufacturer": "Harper + Wilde", "Supplier": "Harper + Wilde", "Material": "Hand Rolled Clay Beads, Cuttlefish Bone, Hemp Rope", "Dimensions": "Height: 300mm; Width: 250mm", "Product Image": ""}]',
116
+ },
117
+ {
118
+ "prompt": [
119
+ {"role": "system", "content": new_system_prompt},
120
+ {"role": "user", "content": "Text:\nProject Name: Anse La Mouche\nItem Number: GR-AA12\nDescription: Mirror\nManufacturer: By Contractor\nMaterial/Finish: Clear Mirror (GR-GL02), Powder-Coated Black Aluminium Frame (GR-M03)\nDimensions: Height: 1010mm; Width: 600mm; Depth: 40mm\n\nOutput JSON:"},
121
+ ],
122
+ "answer": '[{"Flag": "", "Product Code": "GR-AA12", "Description": "Mirror", "Manufacturer": "By Contractor", "Supplier": "", "Material": "Clear Mirror (GR-GL02), Powder-Coated Black Aluminium Frame (GR-M03)", "Dimensions": "Height: 1010mm; Width: 600mm; Depth: 40mm", "Product Image": ""}]',
123
+ },
124
+ ]
125
+
126
+ grpo_training_dataset = Dataset.from_list(annotated_data_examples)
127
+ print("Training dataset created!")
128
+
129
def format_reward(completions, **kwargs):
    """GRPO reward: does the completion content parse as a JSON list?

    Scoring per completion:
        +3.0  content is valid JSON and a list
        -1.0  content is valid JSON but not a list
        -2.0  content is malformed JSON, or the completion has no content
    """
    def _score_one(completion):
        has_content = (
            bool(completion)
            and isinstance(completion, list)
            and len(completion) > 0
            and 'content' in completion[0]
        )
        if not has_content:
            return -2.0
        try:
            parsed = json.loads(completion[0]['content'].strip())
        except json.JSONDecodeError:
            return -2.0
        return 3.0 if isinstance(parsed, list) else -1.0

    return [_score_one(completion) for completion in completions]
147
+
148
def accuracy_reward(prompts, completions, answer, **kwargs):
    """GRPO reward: field-level accuracy of extracted JSON vs. ground truth.

    Each (completion, answer) pair is parsed as a JSON list of item dicts
    and compared position by position over `expected_keys`.

    Scoring per completion:
        up to +5.0  proportional to the fraction of matching fields
        +5.0        both sides are empty lists (correct "no items")
        -2.0        parsed JSON is not a list on either side, or no content
        -3.0        malformed JSON (either the completion or the answer)
    """
    scores = []
    # The fixed schema every extracted item is expected to carry.
    expected_keys = ['Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', 'Product Image']

    for completion, true_answer_str in zip(completions, answer):
        score = 0.0
        if completion and isinstance(completion, list) and len(completion) > 0 and 'content' in completion[0]:
            response = completion[0]['content']
            try:
                parsed_response = json.loads(response.strip())
                true_data = json.loads(true_answer_str.strip())

                if isinstance(parsed_response, list) and isinstance(true_data, list):
                    match_count = 0
                    # Extra or missing items on either side dilute the score
                    # through the max() length used as the denominator.
                    total_items = max(len(parsed_response), len(true_data))

                    for i in range(total_items):
                        # Non-dict or out-of-range items degrade to {} so
                        # every key comparison below counts as a mismatch.
                        parsed_item = parsed_response[i] if i < len(parsed_response) and isinstance(parsed_response[i], dict) else {}
                        true_item = true_data[i] if i < len(true_data) and isinstance(true_data[i], dict) else {}

                        key_matches = 0
                        for key in expected_keys:
                            parsed_value = parsed_item.get(key, "")
                            true_value = true_item.get(key, "")
                            # Whitespace-insensitive string comparison.
                            if str(parsed_value).strip() == str(true_value).strip():
                                key_matches += 1

                        if len(expected_keys) > 0:
                            match_count += key_matches / len(expected_keys)

                    if total_items > 0:
                        score += 5.0 * (match_count / total_items)
                    else:
                        # total_items == 0 implies both lists are empty, so
                        # the inner else below is effectively unreachable.
                        if len(parsed_response) == 0 and len(true_data) == 0:
                            score += 5.0
                        else:
                            score -= 2.0
                else:
                    score -= 2.0
            except json.JSONDecodeError:
                score -= 3.0
        else:
            score -= 2.0
        scores.append(score)
    return scores
193
+
194
+ # Quick training (uncomment if needed)
195
+ print("Training model... (This may take a few minutes)")
196
+
197
+ chat_template = \
198
+ "{% if messages[0]['role'] == 'system' %}"\
199
+ "{{ messages[0]['content'] + eos_token }}"\
200
+ "{% set loop_messages = messages[1:] %}"\
201
+ "{% else %}"\
202
+ "{{ new_system_prompt + eos_token }}"\
203
+ "{% set loop_messages = messages %}"\
204
+ "{% endif %}"\
205
+ "{% for message in loop_messages %}"\
206
+ "{% if message['role'] == 'user' %}"\
207
+ "{{ message['content'] }}"\
208
+ "{% elif message['role'] == 'assistant' %}"\
209
+ "{{ message['content'] + eos_token }}"\
210
+ "{% endif %}"\
211
+ "{% endfor %}"
212
+
213
+ tokenizer.chat_template = chat_template
214
+
215
+ vllm_sampling_params = SamplingParams(
216
+ temperature=1.0,
217
+ top_k=50,
218
+ max_tokens=1024,
219
+ stop=[tokenizer.eos_token],
220
+ include_stop_str_in_output=True,
221
+ )
222
+
223
+ training_args = GRPOConfig(
224
+ vllm_sampling_params=vllm_sampling_params,
225
+ temperature=1.0,
226
+ learning_rate=5e-6,
227
+ weight_decay=0.01,
228
+ warmup_ratio=0.1,
229
+ lr_scheduler_type="linear",
230
+ optim="adamw_8bit",
231
+ logging_steps=1,
232
+ per_device_train_batch_size=2, # Reduced for Colab
233
+ gradient_accumulation_steps=1,
234
+ max_prompt_length=512,
235
+ max_completion_length=512,
236
+ max_steps=10, # Reduced for quick demo
237
+ save_steps=10,
238
+ report_to="none",
239
+ output_dir="outputs",
240
+ )
241
+
242
+ trainer = GRPOTrainer(
243
+ model=model,
244
+ processing_class=tokenizer,
245
+ reward_funcs=[format_reward, accuracy_reward],
246
+ args=training_args,
247
+ train_dataset=grpo_training_dataset,
248
+ )
249
+
250
+ trainer.train()
251
+ model.save_lora("grpo_saved_lora")
252
+ print("Model training completed and saved!")
253
+
254
+ class ProductImageExtractor:
255
    def __init__(self, pdf_path, model, tokenizer):
        """Wire up the extractor for one PDF.

        Args:
            pdf_path: path of the PDF to process.
            model: language model used for structured extraction.
            tokenizer: tokenizer matching `model`.
        """
        self.pdf_path = pdf_path
        self.model = model
        self.tokenizer = tokenizer
        # fitz document handle; None until a PDF is opened elsewhere.
        self.doc = None
        self.lora_request = None
        self.image_save_dir = "extracted_product_images"
        # Side effects: loads the trained LoRA adapter (if saved) and
        # creates the image output directory tree.
        self.load_lora("grpo_saved_lora")
        self.setup_directories()
264
+
265
    def load_lora(self, lora_path):
        """Load the trained LoRA adapter from `lora_path`, if present.

        Sets `self.lora_request` on success; resets it to None when
        loading raises. NOTE(review): if the path does not exist, nothing
        happens and nothing is printed — confirm that silent fallback to
        the base model is intended.
        """
        if os.path.exists(lora_path):
            try:
                self.lora_request = self.model.load_lora(lora_path)
                print(f"LoRA adapter loaded from {lora_path}")
            except Exception as e:
                print(f"Error loading LoRA: {e}")
                self.lora_request = None
274
+
275
+ def setup_directories(self):
276
+ """Create necessary directories"""
277
+ os.makedirs(self.image_save_dir, exist_ok=True)
278
+ os.makedirs(f"{self.image_save_dir}/product_images", exist_ok=True)
279
+ os.makedirs(f"{self.image_save_dir}/non_product_images", exist_ok=True)
280
+ print("Directories created for image storage")
281
+
282
+ # def is_product_related_image(self, image_bbox, text_blocks, page_text):
283
+ # """Determine if an image is product-related based on spatial proximity"""
284
+ # # Extract product codes from page text
285
+ # product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
286
+ # product_codes = re.findall(product_code_pattern, page_text)
287
+
288
+ # print('--product codes', product_codes)
289
+
290
+ # if not product_codes:
291
+ # return False, None, 0.0
292
+
293
+ # # Find text blocks containing product codes
294
+ # product_text_blocks = []
295
+ # for block in text_blocks:
296
+ # if len(block) < 5:
297
+ # continue
298
+ # block_text = block[4] # Text content
299
+ # if any(code in block_text for code in product_codes):
300
+ # product_text_blocks.append({
301
+ # 'bbox': block[:4], # x0, y0, x1, y1
302
+ # 'text': block_text,
303
+ # 'codes': [code for code in product_codes if code in block_text]
304
+ # })
305
+
306
+ # if not product_text_blocks:
307
+ # return False, None, 0.0
308
+
309
+ # # Calculate proximity scores
310
+ # max_proximity_score = 0.0
311
+ # closest_product_code = None
312
+
313
+ # for block in product_text_blocks:
314
+ # print('--product codes block', block['codes'])
315
+ # proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])
316
+ # if proximity_score > max_proximity_score:
317
+ # max_proximity_score = proximity_score
318
+ # closest_product_code = block['codes'][0] if block['codes'] else None
319
+
320
+ # # Additional filters for non-product images
321
+ # image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
322
+
323
+ # # Filter out very small images (likely icons/logos)
324
+ # if image_area < 3000: # Adjusted threshold
325
+ # return False, closest_product_code, max_proximity_score
326
+
327
+ # # Filter out images in header/footer areas
328
+ # page_height = 842 # A4 page height in points
329
+ # if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
330
+ # return False, closest_product_code, max_proximity_score
331
+
332
+ # # Consider it product-related if proximity score is above threshold
333
+ # is_product = max_proximity_score > 0.2 # Lowered threshold for better detection
334
+
335
+ # return is_product, closest_product_code, max_proximity_score
336
+
337
    def is_product_related_image(self, image_bbox, text_blocks, page_text):
        """Determine if an image is product-related based on spatial proximity.

        Args:
            image_bbox: (x0, y0, x1, y1) rectangle of the image on the page.
            text_blocks: PyMuPDF "blocks" tuples; index 4 holds the text.
            page_text: full plain text of the page.

        Returns:
            (is_product, closest_product_code, max_proximity_score)
        """
        # Extract product codes from page text (e.g. "GR-AA10", "GR-AA12a").
        product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
        product_codes = re.findall(product_code_pattern, page_text)

        print('--product codes', product_codes)

        if not product_codes:
            return False, None, 0.0

        # Find text blocks containing product codes
        product_text_blocks = []
        for block in text_blocks:
            if len(block) < 5:
                continue
            block_text = block[4]  # Text content
            if any(code in block_text for code in product_codes):
                product_text_blocks.append({
                    'bbox': block[:4],  # x0, y0, x1, y1
                    'text': block_text,
                    'codes': [code for code in product_codes if code in block_text]
                })

        if not product_text_blocks:
            return False, None, 0.0

        # Calculate proximity scores
        max_proximity_score = 0.0
        closest_product_code = None

        for block in product_text_blocks:
            print('--product codes block', block['codes'])
            proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])

            # Immediate return if a high score is found.
            # NOTE(review): this accepts the FIRST block over the threshold,
            # not necessarily the closest one — a best-match variant is kept
            # commented out above; confirm the first-match behaviour is
            # intentional.
            if proximity_score > 0.2: # Use the same threshold as the final check
                max_proximity_score = proximity_score
                closest_product_code = block['codes'][0] if block['codes'] else None
                is_product = self.additional_filters(image_bbox, max_proximity_score)
                return is_product, closest_product_code, max_proximity_score

            if proximity_score > max_proximity_score:
                max_proximity_score = proximity_score
                closest_product_code = block['codes'][0] if block['codes'] else None

        # Apply additional filters to the best-found score
        is_product = self.additional_filters(image_bbox, max_proximity_score)

        return is_product, closest_product_code, max_proximity_score
387
+
388
+
389
+ def additional_filters(self, image_bbox, max_proximity_score):
390
+ """Helper function to apply additional filters"""
391
+ image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
392
+
393
+ # Filter out very small images (likely icons/logos)
394
+ if image_area < 3000:
395
+ return False
396
+
397
+ # Filter out images in header/footer areas
398
+ page_height = 842 # A4 page height in points
399
+ if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
400
+ return False
401
+
402
+ # Consider it product-related if proximity score is above threshold
403
+ return max_proximity_score > 0.2
404
+
405
+ def calculate_proximity_score(self, image_bbox, text_bbox):
406
+ """Calculate proximity score between image and text bounding boxes"""
407
+ img_center_x = (image_bbox[0] + image_bbox[2]) / 2
408
+ img_center_y = (image_bbox[1] + image_bbox[3]) / 2
409
+ text_center_x = (text_bbox[0] + text_bbox[2]) / 2
410
+ text_center_y = (text_bbox[1] + text_bbox[3]) / 2
411
+
412
+ distance = ((img_center_x - text_center_x) ** 2 + (img_center_y - text_center_y) ** 2) ** 0.5
413
+ proximity_score = max(0, 1 - (distance / 800)) # Adjusted for better scoring
414
+
415
+ return proximity_score
416
+
417
    def extract_and_classify_images(self, page, page_num):
        """Extract all images on `page`, classify them as product-related
        or generic, and save each as a PNG under `self.image_save_dir`.

        Args:
            page: fitz page object to scan.
            page_num: 1-based page number, used for filenames and logs.

        Returns:
            (product_images, non_product_images): lists of dicts with keys
            'path', 'bbox', 'product_code', 'proximity_score', 'xref', 'size'.

        NOTE(review): relies on `self.doc` being an open fitz document —
        __init__ initialises it to None; confirm the caller assigns it
        before this method runs.
        """
        images = page.get_images(full=True)
        text_blocks = page.get_text("blocks")
        page_text = page.get_text()

        product_images = []
        non_product_images = []

        for img_index, img_info in enumerate(images):
            xref = img_info[0]  # cross-reference id of the image object

            try:
                # Get image bounding box
                image_list = page.get_image_rects(xref)
                if not image_list:
                    # Image object exists but is not placed on this page.
                    continue

                image_bbox = image_list[0]  # First occurrence

                # Classify by proximity to product-code text blocks.
                is_product, product_code, proximity_score = self.is_product_related_image(
                    image_bbox, text_blocks, page_text
                )

                # Extract and save image
                pix = fitz.Pixmap(self.doc, xref)

                if pix.n - pix.alpha > 3:  # Handle CMYK images (convert to RGB for PNG output)
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                # Generate filename
                if is_product and product_code:
                    category = "product_images"
                    filename = f"page{page_num}_{product_code}_img{img_index+1}.png"
                else:
                    category = "non_product_images"
                    filename = f"page{page_num}_generic_img{img_index+1}.png"

                image_path = os.path.join(self.image_save_dir, category, filename)
                pix.save(image_path)

                image_data = {
                    'path': image_path,
                    'bbox': image_bbox,
                    'product_code': product_code,
                    'proximity_score': proximity_score,
                    'xref': xref,
                    'size': (pix.width, pix.height)
                }

                if is_product:
                    product_images.append(image_data)
                    print(f"✓ Product image: (unknown) (Code: {product_code}, Score: {proximity_score:.2f})")
                else:
                    non_product_images.append(image_data)
                    print(f"• Non-product image: (unknown)")

                pix = None  # Release memory

            except Exception as e:
                # Best-effort: one bad image must not abort the page scan.
                print(f"Error extracting image {img_index+1} on page {page_num}: {e}")

        return product_images, non_product_images
481
+
482
+ def merge_product_data(self, first_page_item, additional_item):
483
+ """Merge product data, prioritizing first page data but filling in missing details"""
484
+ merged_item = first_page_item.copy()
485
+
486
+ # Fill in missing or empty fields from additional item
487
+ for key in ['Flag', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', 'Product Image']:
488
+ if not merged_item.get(key, '').strip() and additional_item.get(key, '').strip():
489
+ merged_item[key] = additional_item[key]
490
+ print(f" → Added missing {key}: {additional_item[key][:50]}...")
491
+
492
+ # For image, prefer the one with better proximity score or first occurrence
493
+ if not merged_item.get('Product Image File', '') and additional_item.get('Product Image File', ''):
494
+ merged_item['Product Image File'] = additional_item['Product Image File']
495
+ print(f" → Added missing image: {os.path.basename(additional_item['Product Image File'])}")
496
+
497
+ return merged_item
498
+
499
    def extract_product_data_with_images(self):
        """Run the full extraction pipeline over every page of the PDF.

        For each page: extract and classify images, run the fine-tuned LLM
        on the page text to obtain structured product records, link each
        record to its best-scoring image, and consolidate duplicate product
        codes across pages (first occurrence wins; later pages only fill
        missing fields via ``merge_product_data``).

        Returns:
            List of de-duplicated product dicts, or None if the PDF could
            not be opened.
        """
        try:
            self.doc = fitz.open(self.pdf_path)
            total_pages = self.doc.page_count  # Store page count before processing
            print(f"Processing PDF: {self.pdf_path}")
            print(f"Total pages: {total_pages}")
        except Exception as e:
            print(f"Error opening PDF: {e}")
            return None

        all_product_images = {}  # Dict to store images by product code
        product_data_tracker = {}  # Track products by code to avoid duplicates

        # Setup inference parameters.
        # NOTE(review): SamplingParams is vLLM's sampling config class; it
        # must be imported elsewhere in this file or this line raises
        # NameError at runtime — confirm against the full file.
        sampling_params = SamplingParams(
            temperature=0.1,
            top_p=1.0,
            max_tokens=1024,
            stop=[self.tokenizer.eos_token],
            include_stop_str_in_output=True,
        )

        for page_num in range(total_pages):
            page = self.doc.load_page(page_num)
            page_text = page.get_text()

            print(f"\n--- Processing page {page_num + 1} ---")

            # Extract and classify images
            product_images, non_product_images = self.extract_and_classify_images(page, page_num + 1)

            # Group product images by product code
            for img_data in product_images:
                if img_data['product_code']:
                    if img_data['product_code'] not in all_product_images:
                        all_product_images[img_data['product_code']] = []
                    all_product_images[img_data['product_code']].append(img_data)

            # Extract product data using trained model.
            # NOTE(review): new_system_prompt is assumed to be a module-level
            # global defined elsewhere in this file — verify it exists.
            messages = [
                {"role": "system", "content": new_system_prompt},
                {"role": "user", "content": f"Text:\n{page_text}\n\nOutput JSON:"},
            ]

            prompt_text = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=False,
                tokenize=False,
            )

            try:
                # fast_generate / lora_request come from the Unsloth+vLLM
                # stack; the first candidate of the first output is used.
                raw_model_output = self.model.fast_generate(
                    prompt_text,
                    sampling_params=sampling_params,
                    lora_request=self.lora_request,
                )[0].outputs[0].text

                # Parse model output; normalize to a list of dicts so the
                # loop below handles both single-object and array replies.
                cleaned_output = raw_model_output.strip()
                parsed_data = json.loads(cleaned_output)

                if isinstance(parsed_data, dict):
                    parsed_data = [parsed_data]
                elif not isinstance(parsed_data, list):
                    parsed_data = []

                # Process extracted items and handle duplicates
                for item in parsed_data:
                    if isinstance(item, dict):
                        product_code = item.get('Product Code', '').strip()

                        # Skip items without product codes
                        if not product_code:
                            continue

                        # Find best matching image for this product
                        # (highest proximity score wins).
                        image_path = ""
                        if product_code in all_product_images:
                            best_image = max(
                                all_product_images[product_code],
                                key=lambda x: x['proximity_score']
                            )
                            image_path = best_image['path']

                        # Create complete item record
                        current_item_data = {
                            "pdf_page_number": page_num + 1,
                            "Flag": item.get('Flag', ''),
                            "Product Code": product_code,
                            "Description": item.get('Description', ''),
                            "Manufacturer": item.get('Manufacturer', ''),
                            "Supplier": item.get('Supplier', ''),
                            "Material": item.get('Material', ''),
                            "Dimensions": item.get('Dimensions', ''),
                            "Product Image": item.get('Product Image', ''),
                            "Product Image File": image_path,
                        }

                        # Check if this product code already exists
                        if product_code in product_data_tracker:
                            print(f" ! Duplicate found for {product_code} on page {page_num + 1}")

                            # Merge with existing data (prioritize first occurrence)
                            existing_item = product_data_tracker[product_code]
                            merged_item = self.merge_product_data(existing_item, current_item_data)
                            product_data_tracker[product_code] = merged_item
                        else:
                            # First occurrence of this product code
                            print(f" ✓ New product: {product_code}")
                            if image_path:
                                print(f" → Linked image: {os.path.basename(image_path)}")

                            product_data_tracker[product_code] = current_item_data

            except Exception as e:
                # Broad catch: a bad page (model failure, invalid JSON) must
                # not abort the whole document.
                print(f"Error processing page {page_num + 1}: {e}")

        # Close document before processing final data
        self.doc.close()

        # Convert tracker to final list (this ensures no duplicates)
        final_data = list(product_data_tracker.values())

        print(f"\n=== DEDUPLICATION SUMMARY ===")
        print(f"Unique products found: {len(final_data)}")
        print(f"Pages processed: {total_pages}")

        # Verify no duplicates exist
        product_codes = [item.get('Product Code', '') for item in final_data]
        unique_codes = set(product_codes)
        if len(product_codes) != len(unique_codes):
            print(f"WARNING: Found {len(product_codes) - len(unique_codes)} duplicate entries!")
        else:
            print("✓ No duplicate product codes confirmed")

        return final_data
636
+
637
print("ProductImageExtractor class defined!")

print("Starting extraction process...")

# Initialize extractor.
# NOTE(review): pdf_file_path, model and tokenizer must be defined earlier
# in this script (outside this chunk) — confirm against the full file.
extractor = ProductImageExtractor(pdf_file_path, model, tokenizer)

# Extract data and images
extracted_data = extractor.extract_product_data_with_images()

if extracted_data:
    # Convert to DataFrame for display
    df_results = pd.DataFrame(extracted_data)
    print(f"\n=== EXTRACTION COMPLETED ===")
    print(f"Total items extracted: {len(df_results)}")
    print(f"Items with product images: {len([item for item in extracted_data if item['Product Image File']])}")

    # Display first few results
    print("\n=== SAMPLE RESULTS ===")
    display_columns = ['Product Code', 'Description', 'Manufacturer', 'Product Image File']
    print(df_results[display_columns].head(10).to_string(index=False))
else:
    # extract_product_data_with_images returns None on open failure, or an
    # empty list when nothing was extracted — both land here.
    print("Failed to extract data from PDF")
660
+
661
def create_excel_with_embedded_images(data, output_filename):
    """Write *data* to an Excel workbook with product images embedded in-cell.

    Args:
        data: list of product dicts (as produced by the extractor); the
            'Product Image File' values are paths to PNG files on disk.
        output_filename: path of the .xlsx file to create.

    Fixes vs. the previous version:
    - ``image_col_index`` is resolved once up front (None when the
      "Product Image" column is absent), so the cell-formatting pass at the
      end can no longer raise NameError when that column is missing.
    - The per-column width log always reports the width actually set,
      instead of a stale value left over from a previous loop iteration.
    - The missing-column message now names the column actually looked up.
    """
    df = pd.DataFrame(data)

    print(f"Creating Excel file: {output_filename}")

    # Create Excel writer with xlsxwriter engine
    with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Product Data', index=False)

        workbook = writer.book
        worksheet = writer.sheets['Product Data']

        def calculate_column_width(column_data, column_name, min_width=8, max_width=50):
            """Calculate optimal column width based on content length."""
            if len(column_data) == 0:
                return min_width

            # Longest rendered value in the column, header included.
            max_length = max(
                len(str(value)) for value in [column_name] + list(column_data)
            )

            # Apply some padding and clamp to sane bounds.
            return min(max(max_length * 1.2, min_width), max_width)

        # Fixed widths for the special columns, auto-calculated otherwise.
        for col_idx, column_name in enumerate(df.columns):
            if column_name == "Product Image":
                width = 20  # wide enough for the embedded thumbnail
            elif column_name == "Product Image File":
                width = 25  # image file path column
            elif column_name == "Description":
                width = calculate_column_width(df[column_name], column_name, min_width=15, max_width=40)
            elif column_name == "Material":
                width = calculate_column_width(df[column_name], column_name, min_width=12, max_width=35)
            elif column_name == "Dimensions":
                width = calculate_column_width(df[column_name], column_name, min_width=15, max_width=30)
            else:
                width = calculate_column_width(df[column_name], column_name)

            worksheet.set_column(col_idx, col_idx, width)
            print(f"Column '{column_name}': width = {width}")

        # Resolve the image column once; None means "no image column".
        image_col_index = (
            df.columns.get_loc("Product Image") if "Product Image" in df.columns else None
        )

        if image_col_index is not None:
            # Uniform image size settings
            UNIFORM_IMAGE_WIDTH = 120   # pixels
            UNIFORM_IMAGE_HEIGHT = 120  # pixels
            CELL_ROW_HEIGHT = 100       # points (Excel row height)

            # Insert images into cells with uniform sizing
            images_inserted = 0
            for row_num in range(1, len(df) + 1):  # row 0 is the header
                image_path = df.iloc[row_num - 1]['Product Image File']

                if image_path and os.path.exists(image_path):
                    try:
                        # Set consistent row height for all image rows
                        worksheet.set_row(row_num, CELL_ROW_HEIGHT)

                        # Scale so the image fits the target box while
                        # keeping its aspect ratio.
                        with Image.open(image_path) as img:
                            original_width, original_height = img.size

                        uniform_scale = min(
                            UNIFORM_IMAGE_WIDTH / original_width,
                            UNIFORM_IMAGE_HEIGHT / original_height,
                        )

                        worksheet.insert_image(
                            row_num, image_col_index, image_path,
                            {
                                'x_scale': uniform_scale,
                                'y_scale': uniform_scale,
                                'x_offset': 5,  # Small offset from cell border
                                'y_offset': 5,
                                'positioning': 1  # Move and size with cells
                            }
                        )
                        images_inserted += 1

                        print(f" → Inserted uniform image {images_inserted}: {os.path.basename(image_path)} "
                              f"(scale: {uniform_scale:.2f}, orig: {original_width}x{original_height})")

                    except Exception as e:
                        # One bad image must not abort the workbook.
                        print(f"Error embedding image {image_path}: {e}")

            print(f"\nExcel file created with {images_inserted} uniformly-sized embedded images!")
            print(f"All images scaled to approximately {UNIFORM_IMAGE_WIDTH}x{UNIFORM_IMAGE_HEIGHT} pixels")
        else:
            print("Product Image column not found")

        # Header styling.
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BC',
            'border': 1
        })
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)

        # Body styling: wrapped text everywhere; the image column gets an
        # empty bordered cell (the picture floats above it).
        wrap_format = workbook.add_format({
            'text_wrap': True,
            'valign': 'top',
            'border': 1
        })
        image_cell_format = workbook.add_format({
            'border': 1,
            'valign': 'top'
        })

        for row_num in range(1, len(df) + 1):
            for col_num in range(len(df.columns)):
                cell_value = df.iloc[row_num - 1, col_num]
                if col_num == image_col_index:  # never matches when index is None
                    worksheet.write(row_num, col_num, '', image_cell_format)
                else:
                    worksheet.write(row_num, col_num, cell_value, wrap_format)
803
if extracted_data:
    # Build the deliverable workbook with embedded product images.
    output_excel = "product_data_with_images.xlsx"
    create_excel_with_embedded_images(extracted_data, output_excel)

    # Create summary statistics
    df_results = pd.DataFrame(extracted_data)
    total_items = len(df_results)
    items_with_images = len(df_results[df_results['Product Image File'] != ''])
    unique_products = len(df_results[df_results['Product Code'] != '']['Product Code'].unique())

    print(f"\n=== FINAL SUMMARY ===")
    print(f"Total items extracted: {total_items}")
    print(f"Items with images: {items_with_images}")
    print(f"Unique products: {unique_products}")
    print(f"Images saved in: {extractor.image_save_dir}")
    print(f"Excel file: {output_excel}")

    print("Preparing files for download...")

    # NOTE(review): google.colab is only available inside Google Colab —
    # this import raises ModuleNotFoundError in any other runtime (e.g. a
    # HF Space); confirm this block is meant to run in Colab only.
    # Import the correct files module for Colab
    from google.colab import files as colab_files

    # Create a zip file with all results
    # zip_filename = "extraction_results.zip"
    # with zipfile.ZipFile(zip_filename, 'w') as zipf:
    #     # Add Excel file
    #     if os.path.exists("product_data_with_images.xlsx"):
    #         zipf.write("product_data_with_images.xlsx")

    #     # Add all extracted images
    #     if os.path.exists("extracted_product_images"):
    #         for root, dirs, files_list in os.walk("extracted_product_images"):
    #             for file in files_list:
    #                 file_path = os.path.join(root, file)
    #                 arcname = os.path.relpath(file_path, ".")
    #                 zipf.write(file_path, arcname)

    # print(f"Created zip file: {zip_filename}")

    # # Download the zip file
    # if os.path.exists(zip_filename):
    #     colab_files.download(zip_filename)
    #     print("Download started! Check your downloads folder.")
    # else:
    #     print("Error creating zip file")

    # Also download Excel separately
    if os.path.exists("product_data_with_images.xlsx"):
        colab_files.download("product_data_with_images.xlsx")
        print("Excel file download started!")

    print("\nExtraction completed successfully!")
    print("You should now have:")
    print("1. product_data_with_images.xlsx - Excel file with embedded images")
    # print("2. extraction_results.zip - Complete package with all files")
858
+
859
def run_quality_check(extracted_data):
    """Print a quality report for *extracted_data* and return summary stats.

    Counts records with each key field filled in, tallies unique product
    codes and manufacturers, and verifies that referenced image files
    actually exist on disk.

    Returns a dict with 'total_records', 'records_with_codes',
    'records_with_images', 'unique_codes' and 'existing_images'.
    """
    df = pd.DataFrame(extracted_data)

    def filled(column):
        # Rows where the given column holds a non-empty string.
        return df[df[column] != '']

    print("=== QUALITY CHECK REPORT ===")

    # Basic statistics
    records_with_codes = len(filled('Product Code'))
    records_with_images = len(filled('Product Image File'))
    print(f"Total records: {len(df)}")
    print(f"Records with Product Code: {records_with_codes}")
    print(f"Records with Description: {len(filled('Description'))}")
    print(f"Records with Images: {records_with_images}")

    # Product code analysis
    product_codes = filled('Product Code')['Product Code'].tolist()
    unique_codes = set(product_codes)
    print(f"Unique Product Codes: {len(unique_codes)}")

    if product_codes:
        print("Sample Product Codes:", list(unique_codes)[:5])

    # Image file verification
    image_files = filled('Product Image File')['Product Image File'].tolist()
    existing_images = [path for path in image_files if os.path.exists(path)]
    print(f"Image files that exist: {len(existing_images)}/{len(image_files)}")

    # Manufacturer analysis
    manufacturers = filled('Manufacturer')['Manufacturer'].unique()
    print(f"Unique Manufacturers: {len(manufacturers)}")

    return {
        'total_records': len(df),
        'records_with_codes': records_with_codes,
        'records_with_images': records_with_images,
        'unique_codes': len(unique_codes),
        'existing_images': len(existing_images)
    }
895
+
896
if extracted_data:
    # Sanity-check the extracted records before publishing anything.
    quality_stats = run_quality_check(extracted_data)

    # Persist the fine-tuned model and tokenizer locally.
    model_name = "Qwen3_4B_Base_fine_tuned"
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

    # NOTE(review): both pushes require a logged-in Hugging Face session;
    # push_to_hub_gguf is Unsloth's GGUF export/upload helper — confirm the
    # model object actually provides it in this environment.
    model.push_to_hub("pragneshr002/Qwen3_4B_Base_fine_tuned")

    model.push_to_hub_gguf(model_name, tokenizer, quantization_method="q4_k_m")