pragnesh002 commited on
Commit
c71e98d
·
1 Parent(s): c089396

Fix runtime error

Browse files
Files changed (3) hide show
  1. app.py +610 -543
  2. requirements.txt +19 -6
  3. temp_image.py +906 -0
app.py CHANGED
@@ -1,603 +1,670 @@
1
- import gradio as gr
2
- import torch
3
- import fitz # PyMuPDF
4
- import json
5
- import pandas as pd
6
- import os
7
- import re
8
- import xlsxwriter
9
- from PIL import Image
10
- import io
11
- from collections import defaultdict
12
- import numpy as np
13
- import zipfile
14
- from transformers import AutoTokenizer, AutoModelForCausalLM
15
- import warnings
16
- import shutil
17
- import tempfile
18
- import gc
19
- warnings.filterwarnings("ignore")
20
-
21
- # Global variables to store model and tokenizer
22
- MODEL = None
23
- TOKENIZER = None
24
-
25
- def load_model_once():
26
- """Load GGUF model once and keep in memory - Ultra-optimized for CPU"""
27
- global MODEL, TOKENIZER
28
 
29
- if MODEL is not None and TOKENIZER is not None:
30
- print("✅ GGUF Model already loaded in memory")
31
- return MODEL, TOKENIZER
32
 
33
- try:
34
- print("🔄 Loading GGUF model (CPU-optimized)...")
35
 
36
- # GGUF model configurations (choose one)
37
- gguf_models = {
38
- "q4_k_m": "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M", # Recommended
39
- # "q5_k_m": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q5-K-M", # Better quality
40
- # "q8_0": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q8", # High quality
41
- }
42
 
43
- # Use Q4_K_M by default (best balance of speed/quality/size)
44
- model_name = gguf_models["q4_k_m"]
45
 
46
- print(f"Loading GGUF model Change: {model_name}")
47
 
48
- # Load tokenizer
49
- TOKENIZER = AutoTokenizer.from_pretrained(
50
- model_name,
51
- trust_remote_code=True,
52
- use_fast=True # Use fast tokenizer for better performance
53
- )
54
-
55
- print(f"Loaded GGUF TOKENIZER")
56
 
57
- # Load GGUF model with CPU optimizations
58
- MODEL = AutoModelForCausalLM.from_pretrained(
59
- model_name,
60
- device_map="cpu",
61
- trust_remote_code=True,
62
- low_cpu_mem_usage=True,
63
- torch_dtype=torch.float32,
64
- use_cache=True,
65
- cache_dir="/tmp/gguf_cache"
66
- )
67
-
68
- print(f"Loaded GGUF MODEL")
69
 
70
- # Set to evaluation mode
71
- MODEL.eval()
72
 
73
- # CPU optimizations for GGUF
74
- torch.set_num_threads(4) # Optimal for GGUF models
75
- torch.set_num_interop_threads(2)
76
 
77
- print("✅ GGUF Model loaded successfully on CPU!")
78
- print(f"Model type: GGUF Q4_K_M Quantized")
79
- print(f"Memory footprint: ~2.5GB (vs ~8GB for full model)")
80
- print(f"CPU threads: {torch.get_num_threads()}")
81
 
82
- return MODEL, TOKENIZER
83
 
84
- except Exception as e:
85
- print(f"❌ Error loading GGUF model: {e}")
86
- print("Falling back to regular model loading...")
87
 
88
- # Fallback to regular model if GGUF fails
89
- try:
90
- fallback_model = "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M"
91
- TOKENIZER = AutoTokenizer.from_pretrained(fallback_model)
92
- MODEL = AutoModelForCausalLM.from_pretrained(
93
- fallback_model,
94
- device_map="cpu",
95
- torch_dtype=torch.float32,
96
- low_cpu_mem_usage=True
97
- )
98
- print("✅ Fallback model loaded")
99
- return MODEL, TOKENIZER
100
- except:
101
- return None, None
102
-
103
- class ProductImageExtractor:
104
- def __init__(self):
105
- # Create temporary directory for images
106
- self.temp_dir = tempfile.mkdtemp(prefix="pdf_extractor_")
107
- self.image_save_dir = os.path.join(self.temp_dir, "extracted_product_images")
108
- self.model = None
109
- self.tokenizer = None
110
- self.setup_directories()
111
- self.load_model()
112
-
113
- def load_model(self):
114
- """Load the pre-loaded model"""
115
- self.model, self.tokenizer = load_model_once()
116
- if self.model is None:
117
- raise Exception("Failed to load model")
118
-
119
- def setup_directories(self):
120
- """Create necessary directories in temp location"""
121
- os.makedirs(self.image_save_dir, exist_ok=True)
122
- os.makedirs(f"{self.image_save_dir}/product_images", exist_ok=True)
123
- os.makedirs(f"{self.image_save_dir}/non_product_images", exist_ok=True)
124
-
125
- def cleanup_temp_files(self):
126
- """Clean up temporary image files"""
127
- try:
128
- if os.path.exists(self.temp_dir):
129
- shutil.rmtree(self.temp_dir)
130
- print(f"🧹 Cleaned up temporary files: {self.temp_dir}")
131
- except Exception as e:
132
- print(f"Warning: Could not clean up temp files: {e}")
133
-
134
- def generate_text(self, prompt):
135
- """Generate text using the cached model - CPU optimized"""
136
- if self.model is None or self.tokenizer is None:
137
- return "Error: Model not loaded"
138
 
139
- try:
140
- # CPU-optimized tokenization
141
- inputs = self.tokenizer.encode(
142
- prompt,
143
- return_tensors="pt",
144
- truncation=True,
145
- max_length=1024 # Limit input length for CPU
146
- )
147
 
148
- # Generate with CPU-optimized settings
149
- with torch.no_grad():
150
- outputs = self.model.generate(
151
- inputs,
152
- max_new_tokens=512, # Reduced for CPU
153
- temperature=0.1,
154
- do_sample=False, # Greedy decoding for CPU (faster)
155
- pad_token_id=self.tokenizer.eos_token_id,
156
- eos_token_id=self.tokenizer.eos_token_id,
157
- use_cache=True,
158
- num_beams=1, # No beam search (faster on CPU)
159
- )
160
 
161
- # Decode response
162
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
163
 
164
- # Extract only the generated part
165
- prompt_length = len(self.tokenizer.decode(inputs[0], skip_special_tokens=True))
166
- response = response[prompt_length:].strip()
167
 
168
- # Force garbage collection to free memory
169
- del inputs, outputs
170
- gc.collect()
171
 
172
- return response
173
 
174
- except Exception as e:
175
- return f"Error in generation: {e}"
176
-
177
- def is_product_related_image(self, image_bbox, text_blocks, page_text):
178
- """Determine if an image is product-related"""
179
- product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
180
- product_codes = re.findall(product_code_pattern, page_text)
181
-
182
- if not product_codes:
183
- return False, None, 0.0
184
-
185
- product_text_blocks = []
186
- for block in text_blocks:
187
- if len(block) < 5:
188
- continue
189
- block_text = block[4]
190
- if any(code in block_text for code in product_codes):
191
- product_text_blocks.append({
192
- 'bbox': block[:4],
193
- 'text': block_text,
194
- 'codes': [code for code in product_codes if code in block_text]
195
- })
196
-
197
- if not product_text_blocks:
198
- return False, None, 0.0
199
-
200
- max_proximity_score = 0.0
201
- closest_product_code = None
202
-
203
- for block in product_text_blocks:
204
- proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])
205
- if proximity_score > max_proximity_score:
206
- max_proximity_score = proximity_score
207
- closest_product_code = block['codes'][0] if block['codes'] else None
208
-
209
- is_product = self.additional_filters(image_bbox, max_proximity_score)
210
- return is_product, closest_product_code, max_proximity_score
211
-
212
- def additional_filters(self, image_bbox, max_proximity_score):
213
- """Apply additional filters for image classification"""
214
- image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
215
- if image_area < 3000:
216
- return False
217
- page_height = 842
218
- if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
219
- return False
220
- return max_proximity_score > 0.2
221
-
222
- def calculate_proximity_score(self, image_bbox, text_bbox):
223
- """Calculate proximity score between image and text"""
224
- img_center_x = (image_bbox[0] + image_bbox[2]) / 2
225
- img_center_y = (image_bbox[1] + image_bbox[3]) / 2
226
- text_center_x = (text_bbox[0] + text_bbox[2]) / 2
227
- text_center_y = (text_bbox[1] + text_bbox[3]) / 2
228
- distance = ((img_center_x - text_center_x) ** 2 + (img_center_y - text_center_y) ** 2) ** 0.5
229
- proximity_score = max(0, 1 - (distance / 800))
230
- return proximity_score
231
-
232
- def extract_and_classify_images(self, page, page_num, doc):
233
- """Extract and classify images from page"""
234
- images = page.get_images(full=True)
235
- text_blocks = page.get_text("blocks")
236
- page_text = page.get_text()
237
-
238
- product_images = []
239
-
240
- for img_index, img_info in enumerate(images):
241
- xref = img_info[0]
242
- try:
243
- image_list = page.get_image_rects(xref)
244
- if not image_list:
245
- continue
246
-
247
- image_bbox = image_list[0]
248
- is_product, product_code, proximity_score = self.is_product_related_image(
249
- image_bbox, text_blocks, page_text
250
- )
251
-
252
- if is_product and product_code:
253
- pix = fitz.Pixmap(doc, xref)
254
- if pix.n - pix.alpha > 3:
255
- pix = fitz.Pixmap(fitz.csRGB, pix)
256
-
257
- filename = f"page{page_num}_{product_code}_img{img_index+1}.png"
258
- image_path = os.path.join(self.image_save_dir, "product_images", filename)
259
- pix.save(image_path)
260
-
261
- image_data = {
262
- 'path': image_path,
263
- 'product_code': product_code,
264
- 'proximity_score': proximity_score
265
- }
266
- product_images.append(image_data)
267
- pix = None
268
-
269
- except Exception as e:
270
- print(f"Error extracting image {img_index+1}: {e}")
271
-
272
- return product_images
273
-
274
- def extract_product_data_with_images(self, pdf_file):
275
- """Main extraction function with automatic cleanup"""
276
- try:
277
- doc = fitz.open(pdf_file.name)
278
- total_pages = min(doc.page_count, 10) # Limit to 10 pages for CPU
279
- print(f"Processing {total_pages} pages on CPU...")
280
- except Exception as e:
281
- return None, f"Error opening PDF: {e}"
282
-
283
- all_product_images = {}
284
- product_data_tracker = {}
285
-
286
- system_prompt = """You are a data extraction assistant.
287
- Extract the item details from the provided text.
288
- Provide the output as a JSON object, where each object represents an item and has the following keys: 'Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', and 'Product Image'.
289
- If a key's value is not found in the text for an item, provide an empty string "".
290
- If no items are found, return an empty JSON [].
291
- Do not include any extra text or formatting outside the JSON object.
292
- Include rows with unique Product Code values only."""
293
-
294
- for page_num in range(total_pages):
295
- page = doc.load_page(page_num)
296
- page_text = page.get_text()
297
-
298
- if len(page_text.strip()) < 50: # Skip mostly empty pages
299
- continue
300
-
301
- print(f"Processing page {page_num + 1}...")
302
-
303
- # Extract images
304
- product_images = self.extract_and_classify_images(page, page_num + 1, doc)
305
- for img_data in product_images:
306
- if img_data['product_code']:
307
- if img_data['product_code'] not in all_product_images:
308
- all_product_images[img_data['product_code']] = []
309
- all_product_images[img_data['product_code']].append(img_data)
310
-
311
- # Extract text data (CPU-optimized processing)
312
- prompt = f"{system_prompt}\n\nText:\n{page_text[:2000]}\n\nOutput JSON:" # Limit text length
313
- raw_output = self.generate_text(prompt)
314
-
315
- try:
316
- # Parse JSON response
317
- json_start = raw_output.find('[')
318
- json_end = raw_output.rfind(']') + 1
319
 
320
- if json_start != -1 and json_end != 0:
321
- json_str = raw_output[json_start:json_end]
322
- else:
323
- json_str = raw_output.strip()
324
-
325
- parsed_data = json.loads(json_str)
326
- if isinstance(parsed_data, dict):
327
- parsed_data = [parsed_data]
328
- elif not isinstance(parsed_data, list):
329
- parsed_data = []
330
-
331
- for item in parsed_data:
332
- if isinstance(item, dict):
333
- product_code = item.get('Product Code', '').strip()
334
- if not product_code:
335
- continue
336
-
337
- # Find best matching image
338
- image_path = ""
339
- if product_code in all_product_images:
340
- best_image = max(all_product_images[product_code],
341
- key=lambda x: x['proximity_score'])
342
- image_path = best_image['path']
343
-
344
- current_item_data = {
345
- "pdf_page_number": page_num + 1,
346
- "Flag": item.get('Flag', ''),
347
- "Product Code": product_code,
348
- "Description": item.get('Description', ''),
349
- "Manufacturer": item.get('Manufacturer', ''),
350
- "Supplier": item.get('Supplier', ''),
351
- "Material": item.get('Material', ''),
352
- "Dimensions": item.get('Dimensions', ''),
353
- "Product Image": item.get('Product Image', ''),
354
- "Product Image File": image_path,
355
- }
356
-
357
- if product_code not in product_data_tracker:
358
- product_data_tracker[product_code] = current_item_data
359
-
360
- except Exception as e:
361
- print(f"Error processing page {page_num + 1}: {e}")
362
-
363
- doc.close()
364
- final_data = list(product_data_tracker.values())
365
- return final_data, None
366
-
367
- def create_excel_with_images_and_cleanup(data, extractor, output_filename="product_data_with_images.xlsx"):
368
- """Create Excel file with embedded images, then clean up image files"""
369
- if not data:
370
- return None
371
 
372
- df = pd.DataFrame(data)
373
 
374
- try:
375
- with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
376
- df.to_excel(writer, sheet_name='Product Data', index=False)
377
 
378
- workbook = writer.book
379
- worksheet = writer.sheets['Product Data']
380
 
381
- # Set column widths
382
- for col_idx, column_name in enumerate(df.columns):
383
- if column_name == "Product Image":
384
- worksheet.set_column(col_idx, col_idx, 20)
385
- elif column_name in ["Description", "Material"]:
386
- worksheet.set_column(col_idx, col_idx, 30)
387
- else:
388
- worksheet.set_column(col_idx, col_idx, 15)
389
 
390
- # Add header formatting
391
- header_format = workbook.add_format({
392
- 'bold': True,
393
- 'text_wrap': True,
394
- 'valign': 'top',
395
- 'fg_color': '#D7E4BC',
396
- 'border': 1
397
- })
398
 
399
- for col_num, value in enumerate(df.columns.values):
400
- worksheet.write(0, col_num, value, header_format)
401
 
402
- # Embed images
403
- try:
404
- image_col_index = df.columns.get_loc("Product Image")
405
 
406
- for row_num in range(1, len(df) + 1):
407
- image_path = df.iloc[row_num - 1]['Product Image File']
408
 
409
- if image_path and os.path.exists(image_path):
410
- try:
411
- worksheet.set_row(row_num, 80)
412
- worksheet.insert_image(
413
- row_num, image_col_index, image_path,
414
- {'x_scale': 0.2, 'y_scale': 0.2, 'x_offset': 5, 'y_offset': 5}
415
- )
416
- except Exception as e:
417
- print(f"Error embedding image: {e}")
418
 
419
- except KeyError:
420
- pass
421
 
422
- print("✅ Excel file created successfully")
423
 
424
- # Now clean up temporary image files
425
- extractor.cleanup_temp_files()
426
- print("🧹 Temporary image files cleaned up")
427
 
428
- # Also clean up the "Product Image File" column data to show cleanup
429
- df_clean = df.copy()
430
- df_clean['Product Image File'] = df_clean['Product Image File'].apply(
431
- lambda x: "✅ Embedded in Excel (temp files cleaned)" if x else ""
432
- )
433
 
434
- return output_filename, df_clean
435
 
436
- except Exception as e:
437
- print(f"Error creating Excel: {e}")
438
- # Still try to cleanup on error
439
- extractor.cleanup_temp_files()
440
- return None, df
441
-
442
- def process_pdf(pdf_file, progress=gr.Progress()):
443
- """Main processing function with automatic cleanup"""
444
- if pdf_file is None:
445
- return "Please upload a PDF file", None, None
446
 
447
- progress(0.1, desc="Initializing CPU-optimized extractor...")
448
 
449
- extractor = None
450
- try:
451
- extractor = ProductImageExtractor()
452
- except Exception as e:
453
- return f"Error initializing extractor: {e}", None, None
454
 
455
- progress(0.3, desc="Extracting data from PDF (CPU mode - may take 2-3 minutes)...")
456
- extracted_data, error = extractor.extract_product_data_with_images(pdf_file)
457
 
458
- if error:
459
- if extractor:
460
- extractor.cleanup_temp_files()
461
- return f"Error: {error}", None, None
462
 
463
- if not extracted_data:
464
- if extractor:
465
- extractor.cleanup_temp_files()
466
- return "No product data found in the PDF", None, None
467
 
468
- progress(0.7, desc="Creating Excel file and embedding images...")
469
- excel_file, df_clean = create_excel_with_images_and_cleanup(extracted_data, extractor)
470
 
471
- if excel_file is None:
472
- return "Error creating Excel file", pd.DataFrame(extracted_data), None
473
 
474
- progress(0.9, desc="Finalizing and cleaning up...")
475
 
476
- summary = f"""
477
- **✅ Extraction Completed Successfully!**
478
-
479
- **📊 Results:**
480
- - **Total items extracted:** {len(df_clean)}
481
- - **Items with product codes:** {len(df_clean[df_clean['Product Code'] != ''])}
482
- - **Items with images:** {len([x for x in extracted_data if x['Product Image File']])}
483
- - **Unique products:** {len(df_clean[df_clean['Product Code'] != '']['Product Code'].unique()) if len(df_clean[df_clean['Product Code'] != '']) > 0 else 0}
484
-
485
- **💻 CPU Processing:**
486
- - **Mode:** CPU-optimized inference
487
- - **Pages processed:** {df_clean['pdf_page_number'].max() if 'pdf_page_number' in df_clean.columns else 'N/A'}
488
- - **Images:** Embedded in Excel, temporary files cleaned up ✅
489
-
490
- **📥 Ready for Download!**
491
- """
492
 
493
- progress(1.0, desc="Complete!")
494
- return summary, df_clean, excel_file
495
-
496
- # Pre-load the model
497
- print("🚀 Initializing PDF Product Extractor (CPU Mode)...")
498
- print("Loading model into memory...")
499
-
500
- model, tokenizer = load_model_once()
501
- if model is None:
502
- print("❌ Failed to load model during startup")
503
- else:
504
- print("✅ Model successfully loaded and cached on CPU!")
505
-
506
- # Create Gradio interface
507
- with gr.Blocks(
508
- title="PDF Product Data Extractor - CPU Optimized",
509
- theme=gr.themes.Soft(),
510
- ) as demo:
511
 
512
- gr.HTML("""
513
- <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem;">
514
- <h1>📄 PDF Product Data Extractor</h1>
515
- <p>🖥️ CPU-Optimized | 🧹 Auto-Cleanup | 📊 Memory Efficient</p>
516
- </div>
517
- """)
518
 
519
- gr.Markdown("""
520
- ### ⚡ **CPU-Optimized Features:**
521
- - **No GPU Required**: Runs efficiently on CPU-only environments
522
- - **Memory Efficient**: Automatic cleanup of temporary files
523
- - **Cost Effective**: Perfect for free Hugging Face Spaces
524
- - **Smart Processing**: Limited to 10 pages for optimal performance
525
 
526
- ### 🧹 **Automatic Cleanup:**
527
- - Images are temporarily extracted for processing
528
- - Embedded into Excel file during creation
529
- - All temporary image files automatically deleted
530
- - Keeps only the final Excel with embedded images
531
- """)
532
 
533
- with gr.Row():
534
- with gr.Column(scale=1):
535
- pdf_input = gr.File(
536
- label="📁 Upload PDF File",
537
- file_types=[".pdf"],
538
- file_count="single",
539
- height=120
540
- )
541
 
542
- extract_btn = gr.Button(
543
- "🔍 Extract Product Data (CPU Mode)",
544
- variant="primary",
545
- size="lg"
546
- )
547
 
548
- gr.Markdown("""
549
- **💡 CPU Mode Notes:**
550
- - Processing takes 2-3 minutes (vs 30 seconds on GPU)
551
- - Limited to 10 pages per PDF
552
- - Uses 4 CPU threads for stability
553
- - Temporary files auto-cleaned after Excel creation
554
- """)
555
 
556
- with gr.Column(scale=2):
557
- status_output = gr.Markdown(
558
- value="🖥️ CPU mode ready. Upload your PDF to begin processing..."
559
- )
560
 
561
- with gr.Row():
562
- with gr.Column():
563
- data_output = gr.Dataframe(
564
- label="📋 Extracted Product Data",
565
- wrap=True,
566
- interactive=False
567
- )
568
 
569
- with gr.Column():
570
- excel_output = gr.File(
571
- label="📥 Download Excel File",
572
- file_count="single"
573
- )
574
 
575
- extract_btn.click(
576
- fn=process_pdf,
577
- inputs=[pdf_input],
578
- outputs=[status_output, data_output, excel_output],
579
- show_progress=True
580
- )
581
 
582
- gr.Markdown("""
583
- ---
584
- **🔧 Technical Details:**
585
- - **Model**: Fine-tuned Qwen3-4B (CPU-optimized)
586
- - **Processing**: torch.float32, greedy decoding
587
- - **Memory**: Auto garbage collection, temp file cleanup
588
- - **Threads**: Limited to 4 CPU threads for stability
589
 
590
- **🧹 Cleanup Process:**
591
- 1. Images extracted to temporary directory
592
- 2. Data processed and Excel created with embedded images
593
- 3. Temporary image files automatically deleted
594
- 4. Only final Excel file retained with embedded images
595
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  if __name__ == "__main__":
598
- demo.launch(
599
- server_name="0.0.0.0",
600
- server_port=7860,
601
- share=False,
602
- show_error=True
603
- )
 
1
+ # import gradio as gr
2
+ # import torch
3
+ # import fitz # PyMuPDF
4
+ # import json
5
+ # import pandas as pd
6
+ # import os
7
+ # import re
8
+ # import xlsxwriter
9
+ # from PIL import Image
10
+ # import io
11
+ # from collections import defaultdict
12
+ # import numpy as np
13
+ # import zipfile
14
+ # from transformers import AutoTokenizer, AutoModelForCausalLM
15
+ # import warnings
16
+ # import shutil
17
+ # import tempfile
18
+ # import gc
19
+ # warnings.filterwarnings("ignore")
20
+
21
+ # # Global variables to store model and tokenizer
22
+ # MODEL = None
23
+ # TOKENIZER = None
24
+
25
+ # def load_model_once():
26
+ # """Load GGUF model once and keep in memory - Ultra-optimized for CPU"""
27
+ # global MODEL, TOKENIZER
28
 
29
+ # if MODEL is not None and TOKENIZER is not None:
30
+ # print("✅ GGUF Model already loaded in memory")
31
+ # return MODEL, TOKENIZER
32
 
33
+ # try:
34
+ # print("🔄 Loading GGUF model (CPU-optimized)...")
35
 
36
+ # # GGUF model configurations (choose one)
37
+ # gguf_models = {
38
+ # "q4_k_m": "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M", # Recommended
39
+ # # "q5_k_m": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q5-K-M", # Better quality
40
+ # # "q8_0": "your_username/Qwen3-4B-Product-Extractor-GGUF-Q8", # High quality
41
+ # }
42
 
43
+ # # Use Q4_K_M by default (best balance of speed/quality/size)
44
+ # model_name = gguf_models["q4_k_m"]
45
 
46
+ # print(f"Loading GGUF model Change: {model_name}")
47
 
48
+ # # Load tokenizer
49
+ # TOKENIZER = AutoTokenizer.from_pretrained(
50
+ # model_name,
51
+ # trust_remote_code=True,
52
+ # use_fast=True # Use fast tokenizer for better performance
53
+ # )
54
+
55
+ # print(f"Loaded GGUF TOKENIZER")
56
 
57
+ # # Load GGUF model with CPU optimizations
58
+ # MODEL = AutoModelForCausalLM.from_pretrained(
59
+ # model_name,
60
+ # device_map="cpu",
61
+ # trust_remote_code=True,
62
+ # low_cpu_mem_usage=True,
63
+ # torch_dtype=torch.float32,
64
+ # use_cache=True,
65
+ # cache_dir="/tmp/gguf_cache"
66
+ # )
67
+
68
+ # print(f"Loaded GGUF MODEL")
69
 
70
+ # # Set to evaluation mode
71
+ # MODEL.eval()
72
 
73
+ # # CPU optimizations for GGUF
74
+ # torch.set_num_threads(4) # Optimal for GGUF models
75
+ # torch.set_num_interop_threads(2)
76
 
77
+ # print("✅ GGUF Model loaded successfully on CPU!")
78
+ # print(f"Model type: GGUF Q4_K_M Quantized")
79
+ # print(f"Memory footprint: ~2.5GB (vs ~8GB for full model)")
80
+ # print(f"CPU threads: {torch.get_num_threads()}")
81
 
82
+ # return MODEL, TOKENIZER
83
 
84
+ # except Exception as e:
85
+ # print(f"❌ Error loading GGUF model: {e}")
86
+ # print("Falling back to regular model loading...")
87
 
88
+ # # Fallback to regular model if GGUF fails
89
+ # try:
90
+ # fallback_model = "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M"
91
+ # TOKENIZER = AutoTokenizer.from_pretrained(fallback_model)
92
+ # MODEL = AutoModelForCausalLM.from_pretrained(
93
+ # fallback_model,
94
+ # device_map="cpu",
95
+ # torch_dtype=torch.float32,
96
+ # low_cpu_mem_usage=True
97
+ # )
98
+ # print("✅ Fallback model loaded")
99
+ # return MODEL, TOKENIZER
100
+ # except:
101
+ # return None, None
102
+
103
+ # class ProductImageExtractor:
104
+ # def __init__(self):
105
+ # # Create temporary directory for images
106
+ # self.temp_dir = tempfile.mkdtemp(prefix="pdf_extractor_")
107
+ # self.image_save_dir = os.path.join(self.temp_dir, "extracted_product_images")
108
+ # self.model = None
109
+ # self.tokenizer = None
110
+ # self.setup_directories()
111
+ # self.load_model()
112
+
113
+ # def load_model(self):
114
+ # """Load the pre-loaded model"""
115
+ # self.model, self.tokenizer = load_model_once()
116
+ # if self.model is None:
117
+ # raise Exception("Failed to load model")
118
+
119
+ # def setup_directories(self):
120
+ # """Create necessary directories in temp location"""
121
+ # os.makedirs(self.image_save_dir, exist_ok=True)
122
+ # os.makedirs(f"{self.image_save_dir}/product_images", exist_ok=True)
123
+ # os.makedirs(f"{self.image_save_dir}/non_product_images", exist_ok=True)
124
+
125
+ # def cleanup_temp_files(self):
126
+ # """Clean up temporary image files"""
127
+ # try:
128
+ # if os.path.exists(self.temp_dir):
129
+ # shutil.rmtree(self.temp_dir)
130
+ # print(f"🧹 Cleaned up temporary files: {self.temp_dir}")
131
+ # except Exception as e:
132
+ # print(f"Warning: Could not clean up temp files: {e}")
133
+
134
+ # def generate_text(self, prompt):
135
+ # """Generate text using the cached model - CPU optimized"""
136
+ # if self.model is None or self.tokenizer is None:
137
+ # return "Error: Model not loaded"
138
 
139
+ # try:
140
+ # # CPU-optimized tokenization
141
+ # inputs = self.tokenizer.encode(
142
+ # prompt,
143
+ # return_tensors="pt",
144
+ # truncation=True,
145
+ # max_length=1024 # Limit input length for CPU
146
+ # )
147
 
148
+ # # Generate with CPU-optimized settings
149
+ # with torch.no_grad():
150
+ # outputs = self.model.generate(
151
+ # inputs,
152
+ # max_new_tokens=512, # Reduced for CPU
153
+ # temperature=0.1,
154
+ # do_sample=False, # Greedy decoding for CPU (faster)
155
+ # pad_token_id=self.tokenizer.eos_token_id,
156
+ # eos_token_id=self.tokenizer.eos_token_id,
157
+ # use_cache=True,
158
+ # num_beams=1, # No beam search (faster on CPU)
159
+ # )
160
 
161
+ # # Decode response
162
+ # response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
163
 
164
+ # # Extract only the generated part
165
+ # prompt_length = len(self.tokenizer.decode(inputs[0], skip_special_tokens=True))
166
+ # response = response[prompt_length:].strip()
167
 
168
+ # # Force garbage collection to free memory
169
+ # del inputs, outputs
170
+ # gc.collect()
171
 
172
+ # return response
173
 
174
+ # except Exception as e:
175
+ # return f"Error in generation: {e}"
176
+
177
+ # def is_product_related_image(self, image_bbox, text_blocks, page_text):
178
+ # """Determine if an image is product-related"""
179
+ # product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
180
+ # product_codes = re.findall(product_code_pattern, page_text)
181
+
182
+ # if not product_codes:
183
+ # return False, None, 0.0
184
+
185
+ # product_text_blocks = []
186
+ # for block in text_blocks:
187
+ # if len(block) < 5:
188
+ # continue
189
+ # block_text = block[4]
190
+ # if any(code in block_text for code in product_codes):
191
+ # product_text_blocks.append({
192
+ # 'bbox': block[:4],
193
+ # 'text': block_text,
194
+ # 'codes': [code for code in product_codes if code in block_text]
195
+ # })
196
+
197
+ # if not product_text_blocks:
198
+ # return False, None, 0.0
199
+
200
+ # max_proximity_score = 0.0
201
+ # closest_product_code = None
202
+
203
+ # for block in product_text_blocks:
204
+ # proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])
205
+ # if proximity_score > max_proximity_score:
206
+ # max_proximity_score = proximity_score
207
+ # closest_product_code = block['codes'][0] if block['codes'] else None
208
+
209
+ # is_product = self.additional_filters(image_bbox, max_proximity_score)
210
+ # return is_product, closest_product_code, max_proximity_score
211
+
212
+ # def additional_filters(self, image_bbox, max_proximity_score):
213
+ # """Apply additional filters for image classification"""
214
+ # image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
215
+ # if image_area < 3000:
216
+ # return False
217
+ # page_height = 842
218
+ # if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
219
+ # return False
220
+ # return max_proximity_score > 0.2
221
+
222
+ # def calculate_proximity_score(self, image_bbox, text_bbox):
223
+ # """Calculate proximity score between image and text"""
224
+ # img_center_x = (image_bbox[0] + image_bbox[2]) / 2
225
+ # img_center_y = (image_bbox[1] + image_bbox[3]) / 2
226
+ # text_center_x = (text_bbox[0] + text_bbox[2]) / 2
227
+ # text_center_y = (text_bbox[1] + text_bbox[3]) / 2
228
+ # distance = ((img_center_x - text_center_x) ** 2 + (img_center_y - text_center_y) ** 2) ** 0.5
229
+ # proximity_score = max(0, 1 - (distance / 800))
230
+ # return proximity_score
231
+
232
+ # def extract_and_classify_images(self, page, page_num, doc):
233
+ # """Extract and classify images from page"""
234
+ # images = page.get_images(full=True)
235
+ # text_blocks = page.get_text("blocks")
236
+ # page_text = page.get_text()
237
+
238
+ # product_images = []
239
+
240
+ # for img_index, img_info in enumerate(images):
241
+ # xref = img_info[0]
242
+ # try:
243
+ # image_list = page.get_image_rects(xref)
244
+ # if not image_list:
245
+ # continue
246
+
247
+ # image_bbox = image_list[0]
248
+ # is_product, product_code, proximity_score = self.is_product_related_image(
249
+ # image_bbox, text_blocks, page_text
250
+ # )
251
+
252
+ # if is_product and product_code:
253
+ # pix = fitz.Pixmap(doc, xref)
254
+ # if pix.n - pix.alpha > 3:
255
+ # pix = fitz.Pixmap(fitz.csRGB, pix)
256
+
257
+ # filename = f"page{page_num}_{product_code}_img{img_index+1}.png"
258
+ # image_path = os.path.join(self.image_save_dir, "product_images", filename)
259
+ # pix.save(image_path)
260
+
261
+ # image_data = {
262
+ # 'path': image_path,
263
+ # 'product_code': product_code,
264
+ # 'proximity_score': proximity_score
265
+ # }
266
+ # product_images.append(image_data)
267
+ # pix = None
268
+
269
+ # except Exception as e:
270
+ # print(f"Error extracting image {img_index+1}: {e}")
271
+
272
+ # return product_images
273
+
274
+ # def extract_product_data_with_images(self, pdf_file):
275
+ # """Main extraction function with automatic cleanup"""
276
+ # try:
277
+ # doc = fitz.open(pdf_file.name)
278
+ # total_pages = min(doc.page_count, 10) # Limit to 10 pages for CPU
279
+ # print(f"Processing {total_pages} pages on CPU...")
280
+ # except Exception as e:
281
+ # return None, f"Error opening PDF: {e}"
282
+
283
+ # all_product_images = {}
284
+ # product_data_tracker = {}
285
+
286
+ # system_prompt = """You are a data extraction assistant.
287
+ # Extract the item details from the provided text.
288
+ # Provide the output as a JSON object, where each object represents an item and has the following keys: 'Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', and 'Product Image'.
289
+ # If a key's value is not found in the text for an item, provide an empty string "".
290
+ # If no items are found, return an empty JSON [].
291
+ # Do not include any extra text or formatting outside the JSON object.
292
+ # Include rows with unique Product Code values only."""
293
+
294
+ # for page_num in range(total_pages):
295
+ # page = doc.load_page(page_num)
296
+ # page_text = page.get_text()
297
+
298
+ # if len(page_text.strip()) < 50: # Skip mostly empty pages
299
+ # continue
300
+
301
+ # print(f"Processing page {page_num + 1}...")
302
+
303
+ # # Extract images
304
+ # product_images = self.extract_and_classify_images(page, page_num + 1, doc)
305
+ # for img_data in product_images:
306
+ # if img_data['product_code']:
307
+ # if img_data['product_code'] not in all_product_images:
308
+ # all_product_images[img_data['product_code']] = []
309
+ # all_product_images[img_data['product_code']].append(img_data)
310
+
311
+ # # Extract text data (CPU-optimized processing)
312
+ # prompt = f"{system_prompt}\n\nText:\n{page_text[:2000]}\n\nOutput JSON:" # Limit text length
313
+ # raw_output = self.generate_text(prompt)
314
+
315
+ # try:
316
+ # # Parse JSON response
317
+ # json_start = raw_output.find('[')
318
+ # json_end = raw_output.rfind(']') + 1
319
 
320
+ # if json_start != -1 and json_end != 0:
321
+ # json_str = raw_output[json_start:json_end]
322
+ # else:
323
+ # json_str = raw_output.strip()
324
+
325
+ # parsed_data = json.loads(json_str)
326
+ # if isinstance(parsed_data, dict):
327
+ # parsed_data = [parsed_data]
328
+ # elif not isinstance(parsed_data, list):
329
+ # parsed_data = []
330
+
331
+ # for item in parsed_data:
332
+ # if isinstance(item, dict):
333
+ # product_code = item.get('Product Code', '').strip()
334
+ # if not product_code:
335
+ # continue
336
+
337
+ # # Find best matching image
338
+ # image_path = ""
339
+ # if product_code in all_product_images:
340
+ # best_image = max(all_product_images[product_code],
341
+ # key=lambda x: x['proximity_score'])
342
+ # image_path = best_image['path']
343
+
344
+ # current_item_data = {
345
+ # "pdf_page_number": page_num + 1,
346
+ # "Flag": item.get('Flag', ''),
347
+ # "Product Code": product_code,
348
+ # "Description": item.get('Description', ''),
349
+ # "Manufacturer": item.get('Manufacturer', ''),
350
+ # "Supplier": item.get('Supplier', ''),
351
+ # "Material": item.get('Material', ''),
352
+ # "Dimensions": item.get('Dimensions', ''),
353
+ # "Product Image": item.get('Product Image', ''),
354
+ # "Product Image File": image_path,
355
+ # }
356
+
357
+ # if product_code not in product_data_tracker:
358
+ # product_data_tracker[product_code] = current_item_data
359
+
360
+ # except Exception as e:
361
+ # print(f"Error processing page {page_num + 1}: {e}")
362
+
363
+ # doc.close()
364
+ # final_data = list(product_data_tracker.values())
365
+ # return final_data, None
366
+
367
+ # def create_excel_with_images_and_cleanup(data, extractor, output_filename="product_data_with_images.xlsx"):
368
+ # """Create Excel file with embedded images, then clean up image files"""
369
+ # if not data:
370
+ # return None
371
 
372
+ # df = pd.DataFrame(data)
373
 
374
+ # try:
375
+ # with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
376
+ # df.to_excel(writer, sheet_name='Product Data', index=False)
377
 
378
+ # workbook = writer.book
379
+ # worksheet = writer.sheets['Product Data']
380
 
381
+ # # Set column widths
382
+ # for col_idx, column_name in enumerate(df.columns):
383
+ # if column_name == "Product Image":
384
+ # worksheet.set_column(col_idx, col_idx, 20)
385
+ # elif column_name in ["Description", "Material"]:
386
+ # worksheet.set_column(col_idx, col_idx, 30)
387
+ # else:
388
+ # worksheet.set_column(col_idx, col_idx, 15)
389
 
390
+ # # Add header formatting
391
+ # header_format = workbook.add_format({
392
+ # 'bold': True,
393
+ # 'text_wrap': True,
394
+ # 'valign': 'top',
395
+ # 'fg_color': '#D7E4BC',
396
+ # 'border': 1
397
+ # })
398
 
399
+ # for col_num, value in enumerate(df.columns.values):
400
+ # worksheet.write(0, col_num, value, header_format)
401
 
402
+ # # Embed images
403
+ # try:
404
+ # image_col_index = df.columns.get_loc("Product Image")
405
 
406
+ # for row_num in range(1, len(df) + 1):
407
+ # image_path = df.iloc[row_num - 1]['Product Image File']
408
 
409
+ # if image_path and os.path.exists(image_path):
410
+ # try:
411
+ # worksheet.set_row(row_num, 80)
412
+ # worksheet.insert_image(
413
+ # row_num, image_col_index, image_path,
414
+ # {'x_scale': 0.2, 'y_scale': 0.2, 'x_offset': 5, 'y_offset': 5}
415
+ # )
416
+ # except Exception as e:
417
+ # print(f"Error embedding image: {e}")
418
 
419
+ # except KeyError:
420
+ # pass
421
 
422
+ # print("✅ Excel file created successfully")
423
 
424
+ # # Now clean up temporary image files
425
+ # extractor.cleanup_temp_files()
426
+ # print("🧹 Temporary image files cleaned up")
427
 
428
+ # # Also clean up the "Product Image File" column data to show cleanup
429
+ # df_clean = df.copy()
430
+ # df_clean['Product Image File'] = df_clean['Product Image File'].apply(
431
+ # lambda x: "✅ Embedded in Excel (temp files cleaned)" if x else ""
432
+ # )
433
 
434
+ # return output_filename, df_clean
435
 
436
+ # except Exception as e:
437
+ # print(f"Error creating Excel: {e}")
438
+ # # Still try to cleanup on error
439
+ # extractor.cleanup_temp_files()
440
+ # return None, df
441
+
442
+ # def process_pdf(pdf_file, progress=gr.Progress()):
443
+ # """Main processing function with automatic cleanup"""
444
+ # if pdf_file is None:
445
+ # return "Please upload a PDF file", None, None
446
 
447
+ # progress(0.1, desc="Initializing CPU-optimized extractor...")
448
 
449
+ # extractor = None
450
+ # try:
451
+ # extractor = ProductImageExtractor()
452
+ # except Exception as e:
453
+ # return f"Error initializing extractor: {e}", None, None
454
 
455
+ # progress(0.3, desc="Extracting data from PDF (CPU mode - may take 2-3 minutes)...")
456
+ # extracted_data, error = extractor.extract_product_data_with_images(pdf_file)
457
 
458
+ # if error:
459
+ # if extractor:
460
+ # extractor.cleanup_temp_files()
461
+ # return f"Error: {error}", None, None
462
 
463
+ # if not extracted_data:
464
+ # if extractor:
465
+ # extractor.cleanup_temp_files()
466
+ # return "No product data found in the PDF", None, None
467
 
468
+ # progress(0.7, desc="Creating Excel file and embedding images...")
469
+ # excel_file, df_clean = create_excel_with_images_and_cleanup(extracted_data, extractor)
470
 
471
+ # if excel_file is None:
472
+ # return "Error creating Excel file", pd.DataFrame(extracted_data), None
473
 
474
+ # progress(0.9, desc="Finalizing and cleaning up...")
475
 
476
+ # summary = f"""
477
+ # **✅ Extraction Completed Successfully!**
478
+
479
+ # **📊 Results:**
480
+ # - **Total items extracted:** {len(df_clean)}
481
+ # - **Items with product codes:** {len(df_clean[df_clean['Product Code'] != ''])}
482
+ # - **Items with images:** {len([x for x in extracted_data if x['Product Image File']])}
483
+ # - **Unique products:** {len(df_clean[df_clean['Product Code'] != '']['Product Code'].unique()) if len(df_clean[df_clean['Product Code'] != '']) > 0 else 0}
484
+
485
+ # **💻 CPU Processing:**
486
+ # - **Mode:** CPU-optimized inference
487
+ # - **Pages processed:** {df_clean['pdf_page_number'].max() if 'pdf_page_number' in df_clean.columns else 'N/A'}
488
+ # - **Images:** Embedded in Excel, temporary files cleaned up ✅
489
+
490
+ # **📥 Ready for Download!**
491
+ # """
492
 
493
+ # progress(1.0, desc="Complete!")
494
+ # return summary, df_clean, excel_file
495
+
496
+ # # Pre-load the model
497
+ # print("🚀 Initializing PDF Product Extractor (CPU Mode)...")
498
+ # print("Loading model into memory...")
499
+
500
+ # model, tokenizer = load_model_once()
501
+ # if model is None:
502
+ # print("❌ Failed to load model during startup")
503
+ # else:
504
+ # print("✅ Model successfully loaded and cached on CPU!")
505
+
506
+ # # Create Gradio interface
507
+ # with gr.Blocks(
508
+ # title="PDF Product Data Extractor - CPU Optimized",
509
+ # theme=gr.themes.Soft(),
510
+ # ) as demo:
511
 
512
+ # gr.HTML("""
513
+ # <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 2rem; border-radius: 10px; margin-bottom: 2rem;">
514
+ # <h1>📄 PDF Product Data Extractor</h1>
515
+ # <p>🖥️ CPU-Optimized | 🧹 Auto-Cleanup | 📊 Memory Efficient</p>
516
+ # </div>
517
+ # """)
518
 
519
+ # gr.Markdown("""
520
+ # ### ⚡ **CPU-Optimized Features:**
521
+ # - **No GPU Required**: Runs efficiently on CPU-only environments
522
+ # - **Memory Efficient**: Automatic cleanup of temporary files
523
+ # - **Cost Effective**: Perfect for free Hugging Face Spaces
524
+ # - **Smart Processing**: Limited to 10 pages for optimal performance
525
 
526
+ # ### 🧹 **Automatic Cleanup:**
527
+ # - Images are temporarily extracted for processing
528
+ # - Embedded into Excel file during creation
529
+ # - All temporary image files automatically deleted
530
+ # - Keeps only the final Excel with embedded images
531
+ # """)
532
 
533
+ # with gr.Row():
534
+ # with gr.Column(scale=1):
535
+ # pdf_input = gr.File(
536
+ # label="📁 Upload PDF File",
537
+ # file_types=[".pdf"],
538
+ # file_count="single",
539
+ # height=120
540
+ # )
541
 
542
+ # extract_btn = gr.Button(
543
+ # "🔍 Extract Product Data (CPU Mode)",
544
+ # variant="primary",
545
+ # size="lg"
546
+ # )
547
 
548
+ # gr.Markdown("""
549
+ # **💡 CPU Mode Notes:**
550
+ # - Processing takes 2-3 minutes (vs 30 seconds on GPU)
551
+ # - Limited to 10 pages per PDF
552
+ # - Uses 4 CPU threads for stability
553
+ # - Temporary files auto-cleaned after Excel creation
554
+ # """)
555
 
556
+ # with gr.Column(scale=2):
557
+ # status_output = gr.Markdown(
558
+ # value="🖥️ CPU mode ready. Upload your PDF to begin processing..."
559
+ # )
560
 
561
+ # with gr.Row():
562
+ # with gr.Column():
563
+ # data_output = gr.Dataframe(
564
+ # label="📋 Extracted Product Data",
565
+ # wrap=True,
566
+ # interactive=False
567
+ # )
568
 
569
+ # with gr.Column():
570
+ # excel_output = gr.File(
571
+ # label="📥 Download Excel File",
572
+ # file_count="single"
573
+ # )
574
 
575
+ # extract_btn.click(
576
+ # fn=process_pdf,
577
+ # inputs=[pdf_input],
578
+ # outputs=[status_output, data_output, excel_output],
579
+ # show_progress=True
580
+ # )
581
 
582
+ # gr.Markdown("""
583
+ # ---
584
+ # **🔧 Technical Details:**
585
+ # - **Model**: Fine-tuned Qwen3-4B (CPU-optimized)
586
+ # - **Processing**: torch.float32, greedy decoding
587
+ # - **Memory**: Auto garbage collection, temp file cleanup
588
+ # - **Threads**: Limited to 4 CPU threads for stability
589
 
590
+ # **🧹 Cleanup Process:**
591
+ # 1. Images extracted to temporary directory
592
+ # 2. Data processed and Excel created with embedded images
593
+ # 3. Temporary image files automatically deleted
594
+ # 4. Only final Excel file retained with embedded images
595
+ # """)
596
+
597
+ # if __name__ == "__main__":
598
+ # demo.launch(
599
+ # server_name="0.0.0.0",
600
+ # server_port=7860,
601
+ # share=False,
602
+ # show_error=True
603
+ # )
604
+
605
+ import gradio as gr
606
+ import os
607
+ import shutil
608
+ import pandas as pd
609
+
610
+ from unsloth import FastLanguageModel
611
+ from temp_image import ProductImageExtractor, create_excel_with_embedded_images
612
+
613
# -------------------------------
# Load model once at startup
# -------------------------------
# Runs at import time so every Gradio request reuses the same weights.
print("🚀 Loading fine-tuned model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    "pragnesh002/Qwen3-4B-Product-Extractor-GGUF-Q4-K-M",
    max_seq_length=2048,   # max prompt + completion length in tokens
    load_in_4bit=True,     # 4-bit quantised weights to fit small GPUs
    fast_inference=True,   # NOTE(review): enables vLLM backend — confirm the host has a GPU and vLLM installed
)
print("✅ Model loaded successfully!")
624
+
625
+
626
+ # -------------------------------
627
+ # PDF → Excel processing function
628
+ # -------------------------------
629
def process_pdf(pdf_file):
    """Run the PDF -> Excel extraction pipeline for one uploaded file.

    Args:
        pdf_file: Path string (what ``gr.File`` delivers with
            ``type="filepath"`` in Gradio 4/5) or a file-like object
            exposing ``.name`` (older Gradio wrapped uploads in a
            tempfile object).

    Returns:
        Tuple of (status message, path to the Excel file or None).
    """
    # Normalise the upload to a plain path: gr.File's payload type changed
    # across Gradio versions, so accept both shapes. The original code
    # assumed an object with `.name`, which crashes on a str payload.
    if isinstance(pdf_file, str):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", None)

    if not pdf_path or not pdf_path.lower().endswith(".pdf"):
        return "❌ Please upload a valid PDF file.", None

    extractor = ProductImageExtractor(pdf_path, model, tokenizer)

    # Extract structured product data (plus matched product images).
    extracted_data = extractor.extract_product_data_with_images()

    if not extracted_data:
        return "⚠️ No product data extracted.", None

    # Generate the Excel workbook with the images embedded in it.
    output_excel = "product_data.xlsx"
    create_excel_with_embedded_images(extracted_data, output_excel)

    # Images now live inside the workbook; drop the temporary folder.
    if os.path.exists(extractor.image_save_dir):
        shutil.rmtree(extractor.image_save_dir, ignore_errors=True)

    return f"✅ Extraction complete. {len(extracted_data)} products found.", output_excel
651
+
652
+
653
# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 📑 PDF → Excel Product Extractor (Qwen3-4B Fine-tuned)")
    gr.Markdown("Upload a PDF → extract structured product data into Excel → auto-remove images after generation.")

    with gr.Row():
        # Gradio 4/5 removed type="file": gr.File accepts only "filepath"
        # (str path) or "binary" (bytes). type="file" raised ValueError at
        # construction, so the app crashed before it could serve requests.
        pdf_input = gr.File(label="Upload PDF", type="filepath", file_types=[".pdf"])
        run_btn = gr.Button("Extract to Excel")

    status = gr.Textbox(label="Status", interactive=False)
    excel_output = gr.File(label="Download Excel")

    run_btn.click(process_pdf, inputs=pdf_input, outputs=[status, excel_output])

if __name__ == "__main__":
    # Bind all interfaces on port 7860 so the HF Spaces proxy can reach us.
    demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
requirements.txt CHANGED
@@ -1,13 +1,26 @@
1
  gradio==5.18
2
  torch
3
- transformers
 
 
 
 
 
 
 
4
  PyMuPDF
5
  pandas
6
  xlsxwriter
7
  Pillow
8
  numpy
9
- accelerate
10
- sentencepiece
11
- huggingface_hub
12
- psutil
13
- websockets
 
 
 
 
 
 
 
1
  gradio==5.18
2
  torch
3
+ transformers==4.55.4
4
+ accelerate
5
+ sentencepiece
6
+ huggingface_hub
7
+ psutil
8
+ websockets
9
+
10
+ # PDF & Excel handling
11
  PyMuPDF
12
  pandas
13
  xlsxwriter
14
  Pillow
15
  numpy
16
+
17
+ # Unsloth & related
18
+ unsloth
19
+ vllm==0.10.1
20
+ triton==3.2.0
21
+ bitsandbytes
22
+ xformers
23
+
24
+ # Training helpers
25
+ trl
26
+ datasets
temp_image.py ADDED
@@ -0,0 +1,906 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """temp_image.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1g_LpdbYLQ7dGmAzUiG2X2gPQUsPkDN1D
8
+ """
9
+
10
+ # Commented out IPython magic to ensure Python compatibility.
11
+ # %%capture
12
+ # import os
13
+ # os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
14
+ #
15
+ # # Install packages for Colab
16
+ # !pip install --upgrade -qqq uv
17
+ # try:
18
+ # import numpy; get_numpy = f"numpy=={numpy.__version__}"
19
+ # except:
20
+ # get_numpy = "numpy"
21
+ #
22
+ # try:
23
+ # import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
24
+ # except:
25
+ # is_t4 = False
26
+ #
27
+ # get_vllm, get_triton = ("vllm==0.10.1", "triton==3.2.0") if is_t4 else ("vllm", "triton")
28
+ #
29
+ # !uv pip install -qqq --upgrade \
30
+ # unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
31
+ # !uv pip install -qqq {get_triton}
32
+ # !uv pip install transformers==4.55.4
33
+ # !uv pip install PyMuPDF xlsxwriter pillow
34
+ #
35
+ # print("All packages installed successfully!")
36
+
37
+ from unsloth import FastLanguageModel
38
+ import torch
39
+ import fitz # PyMuPDF
40
+ import json
41
+ import pandas as pd
42
+ import os
43
+ import re
44
+ import xlsxwriter
45
+ from PIL import Image, ImageDraw
46
+ import io
47
+ from collections import defaultdict
48
+ from vllm import SamplingParams
49
+ from trl import GRPOConfig, GRPOTrainer
50
+ from datasets import Dataset
51
+ import numpy as np
52
+ from google.colab import files
53
+ import zipfile
54
+ import matplotlib.pyplot as plt
55
+
56
+ # Model configuration
57
+ max_seq_length = 2048
58
+ lora_rank = 32
59
+
60
+ print("Loading model...")
61
+ model, tokenizer = FastLanguageModel.from_pretrained(
62
+ model_name="unsloth/Qwen3-4B-Base",
63
+ max_seq_length=max_seq_length,
64
+ load_in_4bit=False,
65
+ fast_inference=True,
66
+ max_lora_rank=lora_rank,
67
+ gpu_memory_utilization=0.7,
68
+ )
69
+
70
+ model = FastLanguageModel.get_peft_model(
71
+ model,
72
+ r=lora_rank,
73
+ target_modules=[
74
+ "q_proj", "k_proj", "v_proj", "o_proj",
75
+ "gate_proj", "up_proj", "down_proj",
76
+ ],
77
+ lora_alpha=lora_rank*2,
78
+ use_gradient_checkpointing="unsloth",
79
+ random_state=3407,
80
+ )
81
+
82
+ print("Model loaded successfully!")
83
+
84
+ print("Please upload your PDF file:")
85
+ uploaded = files.upload()
86
+
87
+ # Get the uploaded file name
88
+ pdf_file_path = list(uploaded.keys())[0]
89
+ print(f"Uploaded file: {pdf_file_path}")
90
+
91
+ # Verify the file
92
+ if not pdf_file_path.endswith('.pdf'):
93
+ print("Warning: Please ensure you uploaded a PDF file")
94
+ else:
95
+ print("PDF file ready for processing!")
96
+
97
+ new_system_prompt = """You are a data extraction assistant.
98
+ Extract the item details from the provided text.
99
+ Provide the output as a JSON object, where object represents an item and has the following keys: 'Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', and 'Product Image'.
100
+ If a key's value is not found in the text for an item, provide an empty string "".
101
+ If no items are found, return an empty JSON {}.
102
+ Do not include any extra text or formatting outside the JSON object.
103
+ Include rows with unique Product Code values only.
104
+ For the 'Dimensions' field, extract all dimension information found (e.g., Height, Width, Depth, Diameter, Length) and format them as a single string of key-value pairs separated by semicolons, like "Height: [value]; Width: [value]; Diameter: [value]". If a specific dimension is not available, do not include its key-value pair in the string.
105
+ If we found the data from first page then take those only If there are any missing details or extra details then include with it.
106
+ Do not include any duplicate data in any key of JSON."""
107
+
108
+ # Your existing training data
109
+ annotated_data_examples = [
110
+ {
111
+ "prompt": [
112
+ {"role": "system", "content": new_system_prompt},
113
+ {"role": "user", "content": "Text:\nProject Name: Anse La Mouche\nItem Number: GR-AA10\nDescription: Wall Hanging Art Work\nManufacturer: Harper + Wilde\nSupplier: Harper + Wilde\nMaterial/Finish: Hand Rolled Clay Beads, Cuttlefish Bone, Hemp Rope\nDimensions: Height: 300mm; Width: 250mm\nImage:\n[Image Placeholder]\n\nOutput JSON:"},
114
+ ],
115
+ "answer": '[{"Flag": "", "Product Code": "GR-AA10", "Description": "Wall Hanging Art Work", "Manufacturer": "Harper + Wilde", "Supplier": "Harper + Wilde", "Material": "Hand Rolled Clay Beads, Cuttlefish Bone, Hemp Rope", "Dimensions": "Height: 300mm; Width: 250mm", "Product Image": ""}]',
116
+ },
117
+ {
118
+ "prompt": [
119
+ {"role": "system", "content": new_system_prompt},
120
+ {"role": "user", "content": "Text:\nProject Name: Anse La Mouche\nItem Number: GR-AA12\nDescription: Mirror\nManufacturer: By Contractor\nMaterial/Finish: Clear Mirror (GR-GL02), Powder-Coated Black Aluminium Frame (GR-M03)\nDimensions: Height: 1010mm; Width: 600mm; Depth: 40mm\n\nOutput JSON:"},
121
+ ],
122
+ "answer": '[{"Flag": "", "Product Code": "GR-AA12", "Description": "Mirror", "Manufacturer": "By Contractor", "Supplier": "", "Material": "Clear Mirror (GR-GL02), Powder-Coated Black Aluminium Frame (GR-M03)", "Dimensions": "Height: 1010mm; Width: 600mm; Depth: 40mm", "Product Image": ""}]',
123
+ },
124
+ ]
125
+
126
+ grpo_training_dataset = Dataset.from_list(annotated_data_examples)
127
+ print("Training dataset created!")
128
+
129
def format_reward(completions, **kwargs):
    """GRPO reward: does the completion content parse as a JSON list?

    Scoring per completion:
        +3.0  content is valid JSON and a list
        -1.0  content is valid JSON but not a list
        -2.0  content is malformed JSON, or the completion has no content
    """
    def _score_one(completion):
        has_content = (
            bool(completion)
            and isinstance(completion, list)
            and len(completion) > 0
            and 'content' in completion[0]
        )
        if not has_content:
            return -2.0
        try:
            parsed = json.loads(completion[0]['content'].strip())
        except json.JSONDecodeError:
            return -2.0
        return 3.0 if isinstance(parsed, list) else -1.0

    return [_score_one(completion) for completion in completions]
147
+
148
def accuracy_reward(prompts, completions, answer, **kwargs):
    """GRPO reward: field-level accuracy of extracted JSON vs. ground truth.

    Each (completion, answer) pair is parsed as a JSON list of item dicts
    and compared position by position over `expected_keys`.

    Scoring per completion:
        up to +5.0  proportional to the fraction of matching fields
        +5.0        both sides are empty lists (correct "no items")
        -2.0        parsed JSON is not a list on either side, or no content
        -3.0        malformed JSON (either the completion or the answer)
    """
    scores = []
    # The fixed schema every extracted item is expected to carry.
    expected_keys = ['Flag', 'Product Code', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', 'Product Image']

    for completion, true_answer_str in zip(completions, answer):
        score = 0.0
        if completion and isinstance(completion, list) and len(completion) > 0 and 'content' in completion[0]:
            response = completion[0]['content']
            try:
                parsed_response = json.loads(response.strip())
                true_data = json.loads(true_answer_str.strip())

                if isinstance(parsed_response, list) and isinstance(true_data, list):
                    match_count = 0
                    # Extra or missing items on either side dilute the score
                    # through the max() length used as the denominator.
                    total_items = max(len(parsed_response), len(true_data))

                    for i in range(total_items):
                        # Non-dict or out-of-range items degrade to {} so
                        # every key comparison below counts as a mismatch.
                        parsed_item = parsed_response[i] if i < len(parsed_response) and isinstance(parsed_response[i], dict) else {}
                        true_item = true_data[i] if i < len(true_data) and isinstance(true_data[i], dict) else {}

                        key_matches = 0
                        for key in expected_keys:
                            parsed_value = parsed_item.get(key, "")
                            true_value = true_item.get(key, "")
                            # Whitespace-insensitive string comparison.
                            if str(parsed_value).strip() == str(true_value).strip():
                                key_matches += 1

                        if len(expected_keys) > 0:
                            match_count += key_matches / len(expected_keys)

                    if total_items > 0:
                        score += 5.0 * (match_count / total_items)
                    else:
                        # total_items == 0 implies both lists are empty, so
                        # the inner else below is effectively unreachable.
                        if len(parsed_response) == 0 and len(true_data) == 0:
                            score += 5.0
                        else:
                            score -= 2.0
                else:
                    score -= 2.0
            except json.JSONDecodeError:
                score -= 3.0
        else:
            score -= 2.0
        scores.append(score)
    return scores
193
+
194
+ # Quick training (uncomment if needed)
195
+ print("Training model... (This may take a few minutes)")
196
+
197
+ chat_template = \
198
+ "{% if messages[0]['role'] == 'system' %}"\
199
+ "{{ messages[0]['content'] + eos_token }}"\
200
+ "{% set loop_messages = messages[1:] %}"\
201
+ "{% else %}"\
202
+ "{{ new_system_prompt + eos_token }}"\
203
+ "{% set loop_messages = messages %}"\
204
+ "{% endif %}"\
205
+ "{% for message in loop_messages %}"\
206
+ "{% if message['role'] == 'user' %}"\
207
+ "{{ message['content'] }}"\
208
+ "{% elif message['role'] == 'assistant' %}"\
209
+ "{{ message['content'] + eos_token }}"\
210
+ "{% endif %}"\
211
+ "{% endfor %}"
212
+
213
+ tokenizer.chat_template = chat_template
214
+
215
+ vllm_sampling_params = SamplingParams(
216
+ temperature=1.0,
217
+ top_k=50,
218
+ max_tokens=1024,
219
+ stop=[tokenizer.eos_token],
220
+ include_stop_str_in_output=True,
221
+ )
222
+
223
+ training_args = GRPOConfig(
224
+ vllm_sampling_params=vllm_sampling_params,
225
+ temperature=1.0,
226
+ learning_rate=5e-6,
227
+ weight_decay=0.01,
228
+ warmup_ratio=0.1,
229
+ lr_scheduler_type="linear",
230
+ optim="adamw_8bit",
231
+ logging_steps=1,
232
+ per_device_train_batch_size=2, # Reduced for Colab
233
+ gradient_accumulation_steps=1,
234
+ max_prompt_length=512,
235
+ max_completion_length=512,
236
+ max_steps=10, # Reduced for quick demo
237
+ save_steps=10,
238
+ report_to="none",
239
+ output_dir="outputs",
240
+ )
241
+
242
+ trainer = GRPOTrainer(
243
+ model=model,
244
+ processing_class=tokenizer,
245
+ reward_funcs=[format_reward, accuracy_reward],
246
+ args=training_args,
247
+ train_dataset=grpo_training_dataset,
248
+ )
249
+
250
+ trainer.train()
251
+ model.save_lora("grpo_saved_lora")
252
+ print("Model training completed and saved!")
253
+
254
+ class ProductImageExtractor:
255
    def __init__(self, pdf_path, model, tokenizer):
        """Wire up the extractor for one PDF.

        Args:
            pdf_path: path of the PDF to process.
            model: language model used for structured extraction.
            tokenizer: tokenizer matching `model`.
        """
        self.pdf_path = pdf_path
        self.model = model
        self.tokenizer = tokenizer
        # fitz document handle; None until a PDF is opened elsewhere.
        self.doc = None
        self.lora_request = None
        self.image_save_dir = "extracted_product_images"
        # Side effects: loads the trained LoRA adapter (if saved) and
        # creates the image output directory tree.
        self.load_lora("grpo_saved_lora")
        self.setup_directories()
264
+
265
    def load_lora(self, lora_path):
        """Load the trained LoRA adapter from `lora_path`, if present.

        Sets `self.lora_request` on success; resets it to None when
        loading raises. NOTE(review): if the path does not exist, nothing
        happens and nothing is printed — confirm that silent fallback to
        the base model is intended.
        """
        if os.path.exists(lora_path):
            try:
                self.lora_request = self.model.load_lora(lora_path)
                print(f"LoRA adapter loaded from {lora_path}")
            except Exception as e:
                print(f"Error loading LoRA: {e}")
                self.lora_request = None
274
+
275
+ def setup_directories(self):
276
+ """Create necessary directories"""
277
+ os.makedirs(self.image_save_dir, exist_ok=True)
278
+ os.makedirs(f"{self.image_save_dir}/product_images", exist_ok=True)
279
+ os.makedirs(f"{self.image_save_dir}/non_product_images", exist_ok=True)
280
+ print("Directories created for image storage")
281
+
282
+ # def is_product_related_image(self, image_bbox, text_blocks, page_text):
283
+ # """Determine if an image is product-related based on spatial proximity"""
284
+ # # Extract product codes from page text
285
+ # product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
286
+ # product_codes = re.findall(product_code_pattern, page_text)
287
+
288
+ # print('--product codes', product_codes)
289
+
290
+ # if not product_codes:
291
+ # return False, None, 0.0
292
+
293
+ # # Find text blocks containing product codes
294
+ # product_text_blocks = []
295
+ # for block in text_blocks:
296
+ # if len(block) < 5:
297
+ # continue
298
+ # block_text = block[4] # Text content
299
+ # if any(code in block_text for code in product_codes):
300
+ # product_text_blocks.append({
301
+ # 'bbox': block[:4], # x0, y0, x1, y1
302
+ # 'text': block_text,
303
+ # 'codes': [code for code in product_codes if code in block_text]
304
+ # })
305
+
306
+ # if not product_text_blocks:
307
+ # return False, None, 0.0
308
+
309
+ # # Calculate proximity scores
310
+ # max_proximity_score = 0.0
311
+ # closest_product_code = None
312
+
313
+ # for block in product_text_blocks:
314
+ # print('--product codes block', block['codes'])
315
+ # proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])
316
+ # if proximity_score > max_proximity_score:
317
+ # max_proximity_score = proximity_score
318
+ # closest_product_code = block['codes'][0] if block['codes'] else None
319
+
320
+ # # Additional filters for non-product images
321
+ # image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
322
+
323
+ # # Filter out very small images (likely icons/logos)
324
+ # if image_area < 3000: # Adjusted threshold
325
+ # return False, closest_product_code, max_proximity_score
326
+
327
+ # # Filter out images in header/footer areas
328
+ # page_height = 842 # A4 page height in points
329
+ # if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
330
+ # return False, closest_product_code, max_proximity_score
331
+
332
+ # # Consider it product-related if proximity score is above threshold
333
+ # is_product = max_proximity_score > 0.2 # Lowered threshold for better detection
334
+
335
+ # return is_product, closest_product_code, max_proximity_score
336
+
337
    def is_product_related_image(self, image_bbox, text_blocks, page_text):
        """Determine if an image is product-related based on spatial proximity.

        Args:
            image_bbox: (x0, y0, x1, y1) rectangle of the image on the page.
            text_blocks: PyMuPDF "blocks" tuples; index 4 holds the text.
            page_text: full plain text of the page.

        Returns:
            (is_product, closest_product_code, max_proximity_score)
        """
        # Extract product codes from page text (e.g. "GR-AA10", "GR-AA12a").
        product_code_pattern = r'\b[A-Z]{2}-[A-Z]{2}\d+[a-z]?\b'
        product_codes = re.findall(product_code_pattern, page_text)

        print('--product codes', product_codes)

        if not product_codes:
            return False, None, 0.0

        # Find text blocks containing product codes
        product_text_blocks = []
        for block in text_blocks:
            if len(block) < 5:
                continue
            block_text = block[4]  # Text content
            if any(code in block_text for code in product_codes):
                product_text_blocks.append({
                    'bbox': block[:4],  # x0, y0, x1, y1
                    'text': block_text,
                    'codes': [code for code in product_codes if code in block_text]
                })

        if not product_text_blocks:
            return False, None, 0.0

        # Calculate proximity scores
        max_proximity_score = 0.0
        closest_product_code = None

        for block in product_text_blocks:
            print('--product codes block', block['codes'])
            proximity_score = self.calculate_proximity_score(image_bbox, block['bbox'])

            # Immediate return if a high score is found.
            # NOTE(review): this accepts the FIRST block over the threshold,
            # not necessarily the closest one — a best-match variant is kept
            # commented out above; confirm the first-match behaviour is
            # intentional.
            if proximity_score > 0.2: # Use the same threshold as the final check
                max_proximity_score = proximity_score
                closest_product_code = block['codes'][0] if block['codes'] else None
                is_product = self.additional_filters(image_bbox, max_proximity_score)
                return is_product, closest_product_code, max_proximity_score

            if proximity_score > max_proximity_score:
                max_proximity_score = proximity_score
                closest_product_code = block['codes'][0] if block['codes'] else None

        # Apply additional filters to the best-found score
        is_product = self.additional_filters(image_bbox, max_proximity_score)

        return is_product, closest_product_code, max_proximity_score
387
+
388
+
389
+ def additional_filters(self, image_bbox, max_proximity_score):
390
+ """Helper function to apply additional filters"""
391
+ image_area = (image_bbox[2] - image_bbox[0]) * (image_bbox[3] - image_bbox[1])
392
+
393
+ # Filter out very small images (likely icons/logos)
394
+ if image_area < 3000:
395
+ return False
396
+
397
+ # Filter out images in header/footer areas
398
+ page_height = 842 # A4 page height in points
399
+ if image_bbox[1] < 80 or image_bbox[3] > page_height - 80:
400
+ return False
401
+
402
+ # Consider it product-related if proximity score is above threshold
403
+ return max_proximity_score > 0.2
404
+
405
+ def calculate_proximity_score(self, image_bbox, text_bbox):
406
+ """Calculate proximity score between image and text bounding boxes"""
407
+ img_center_x = (image_bbox[0] + image_bbox[2]) / 2
408
+ img_center_y = (image_bbox[1] + image_bbox[3]) / 2
409
+ text_center_x = (text_bbox[0] + text_bbox[2]) / 2
410
+ text_center_y = (text_bbox[1] + text_bbox[3]) / 2
411
+
412
+ distance = ((img_center_x - text_center_x) ** 2 + (img_center_y - text_center_y) ** 2) ** 0.5
413
+ proximity_score = max(0, 1 - (distance / 800)) # Adjusted for better scoring
414
+
415
+ return proximity_score
416
+
417
    def extract_and_classify_images(self, page, page_num):
        """Extract all images on `page`, classify them as product-related
        or generic, and save each as a PNG under `self.image_save_dir`.

        Args:
            page: fitz page object to scan.
            page_num: 1-based page number, used for filenames and logs.

        Returns:
            (product_images, non_product_images): lists of dicts with keys
            'path', 'bbox', 'product_code', 'proximity_score', 'xref', 'size'.

        NOTE(review): relies on `self.doc` being an open fitz document —
        __init__ initialises it to None; confirm the caller assigns it
        before this method runs.
        """
        images = page.get_images(full=True)
        text_blocks = page.get_text("blocks")
        page_text = page.get_text()

        product_images = []
        non_product_images = []

        for img_index, img_info in enumerate(images):
            xref = img_info[0]  # cross-reference id of the image object

            try:
                # Get image bounding box
                image_list = page.get_image_rects(xref)
                if not image_list:
                    # Image object exists but is not placed on this page.
                    continue

                image_bbox = image_list[0]  # First occurrence

                # Classify by proximity to product-code text blocks.
                is_product, product_code, proximity_score = self.is_product_related_image(
                    image_bbox, text_blocks, page_text
                )

                # Extract and save image
                pix = fitz.Pixmap(self.doc, xref)

                if pix.n - pix.alpha > 3:  # Handle CMYK images (convert to RGB for PNG output)
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                # Generate filename
                if is_product and product_code:
                    category = "product_images"
                    filename = f"page{page_num}_{product_code}_img{img_index+1}.png"
                else:
                    category = "non_product_images"
                    filename = f"page{page_num}_generic_img{img_index+1}.png"

                image_path = os.path.join(self.image_save_dir, category, filename)
                pix.save(image_path)

                image_data = {
                    'path': image_path,
                    'bbox': image_bbox,
                    'product_code': product_code,
                    'proximity_score': proximity_score,
                    'xref': xref,
                    'size': (pix.width, pix.height)
                }

                if is_product:
                    product_images.append(image_data)
                    print(f"✓ Product image: (unknown) (Code: {product_code}, Score: {proximity_score:.2f})")
                else:
                    non_product_images.append(image_data)
                    print(f"• Non-product image: (unknown)")

                pix = None  # Release memory

            except Exception as e:
                # Best-effort: one bad image must not abort the page scan.
                print(f"Error extracting image {img_index+1} on page {page_num}: {e}")

        return product_images, non_product_images
481
+
482
+ def merge_product_data(self, first_page_item, additional_item):
483
+ """Merge product data, prioritizing first page data but filling in missing details"""
484
+ merged_item = first_page_item.copy()
485
+
486
+ # Fill in missing or empty fields from additional item
487
+ for key in ['Flag', 'Description', 'Manufacturer', 'Supplier', 'Material', 'Dimensions', 'Product Image']:
488
+ if not merged_item.get(key, '').strip() and additional_item.get(key, '').strip():
489
+ merged_item[key] = additional_item[key]
490
+ print(f" → Added missing {key}: {additional_item[key][:50]}...")
491
+
492
+ # For image, prefer the one with better proximity score or first occurrence
493
+ if not merged_item.get('Product Image File', '') and additional_item.get('Product Image File', ''):
494
+ merged_item['Product Image File'] = additional_item['Product Image File']
495
+ print(f" → Added missing image: {os.path.basename(additional_item['Product Image File'])}")
496
+
497
+ return merged_item
498
+
499
    def extract_product_data_with_images(self):
        """Run the full extraction pipeline over every page of the PDF.

        For each page: extract and classify images, run the fine-tuned LLM
        on the page text to obtain structured product records, link each
        record to its best-scoring image, and consolidate duplicate product
        codes across pages (first occurrence wins; later pages only fill
        missing fields via ``merge_product_data``).

        Returns:
            List of de-duplicated product dicts, or None if the PDF could
            not be opened.
        """
        try:
            self.doc = fitz.open(self.pdf_path)
            total_pages = self.doc.page_count  # Store page count before processing
            print(f"Processing PDF: {self.pdf_path}")
            print(f"Total pages: {total_pages}")
        except Exception as e:
            print(f"Error opening PDF: {e}")
            return None

        all_product_images = {}  # Dict to store images by product code
        product_data_tracker = {}  # Track products by code to avoid duplicates

        # Setup inference parameters.
        # NOTE(review): SamplingParams is vLLM's sampling config class; it
        # must be imported elsewhere in this file or this line raises
        # NameError at runtime — confirm against the full file.
        sampling_params = SamplingParams(
            temperature=0.1,
            top_p=1.0,
            max_tokens=1024,
            stop=[self.tokenizer.eos_token],
            include_stop_str_in_output=True,
        )

        for page_num in range(total_pages):
            page = self.doc.load_page(page_num)
            page_text = page.get_text()

            print(f"\n--- Processing page {page_num + 1} ---")

            # Extract and classify images
            product_images, non_product_images = self.extract_and_classify_images(page, page_num + 1)

            # Group product images by product code
            for img_data in product_images:
                if img_data['product_code']:
                    if img_data['product_code'] not in all_product_images:
                        all_product_images[img_data['product_code']] = []
                    all_product_images[img_data['product_code']].append(img_data)

            # Extract product data using trained model.
            # NOTE(review): new_system_prompt is assumed to be a module-level
            # global defined elsewhere in this file — verify it exists.
            messages = [
                {"role": "system", "content": new_system_prompt},
                {"role": "user", "content": f"Text:\n{page_text}\n\nOutput JSON:"},
            ]

            prompt_text = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=False,
                tokenize=False,
            )

            try:
                # fast_generate / lora_request come from the Unsloth+vLLM
                # stack; the first candidate of the first output is used.
                raw_model_output = self.model.fast_generate(
                    prompt_text,
                    sampling_params=sampling_params,
                    lora_request=self.lora_request,
                )[0].outputs[0].text

                # Parse model output; normalize to a list of dicts so the
                # loop below handles both single-object and array replies.
                cleaned_output = raw_model_output.strip()
                parsed_data = json.loads(cleaned_output)

                if isinstance(parsed_data, dict):
                    parsed_data = [parsed_data]
                elif not isinstance(parsed_data, list):
                    parsed_data = []

                # Process extracted items and handle duplicates
                for item in parsed_data:
                    if isinstance(item, dict):
                        product_code = item.get('Product Code', '').strip()

                        # Skip items without product codes
                        if not product_code:
                            continue

                        # Find best matching image for this product
                        # (highest proximity score wins).
                        image_path = ""
                        if product_code in all_product_images:
                            best_image = max(
                                all_product_images[product_code],
                                key=lambda x: x['proximity_score']
                            )
                            image_path = best_image['path']

                        # Create complete item record
                        current_item_data = {
                            "pdf_page_number": page_num + 1,
                            "Flag": item.get('Flag', ''),
                            "Product Code": product_code,
                            "Description": item.get('Description', ''),
                            "Manufacturer": item.get('Manufacturer', ''),
                            "Supplier": item.get('Supplier', ''),
                            "Material": item.get('Material', ''),
                            "Dimensions": item.get('Dimensions', ''),
                            "Product Image": item.get('Product Image', ''),
                            "Product Image File": image_path,
                        }

                        # Check if this product code already exists
                        if product_code in product_data_tracker:
                            print(f" ! Duplicate found for {product_code} on page {page_num + 1}")

                            # Merge with existing data (prioritize first occurrence)
                            existing_item = product_data_tracker[product_code]
                            merged_item = self.merge_product_data(existing_item, current_item_data)
                            product_data_tracker[product_code] = merged_item
                        else:
                            # First occurrence of this product code
                            print(f" ✓ New product: {product_code}")
                            if image_path:
                                print(f" → Linked image: {os.path.basename(image_path)}")

                            product_data_tracker[product_code] = current_item_data

            except Exception as e:
                # Broad catch: a bad page (model failure, invalid JSON) must
                # not abort the whole document.
                print(f"Error processing page {page_num + 1}: {e}")

        # Close document before processing final data
        self.doc.close()

        # Convert tracker to final list (this ensures no duplicates)
        final_data = list(product_data_tracker.values())

        print(f"\n=== DEDUPLICATION SUMMARY ===")
        print(f"Unique products found: {len(final_data)}")
        print(f"Pages processed: {total_pages}")

        # Verify no duplicates exist
        product_codes = [item.get('Product Code', '') for item in final_data]
        unique_codes = set(product_codes)
        if len(product_codes) != len(unique_codes):
            print(f"WARNING: Found {len(product_codes) - len(unique_codes)} duplicate entries!")
        else:
            print("✓ No duplicate product codes confirmed")

        return final_data
636
+
637
print("ProductImageExtractor class defined!")

print("Starting extraction process...")

# Initialize extractor.
# NOTE(review): pdf_file_path, model and tokenizer must be defined earlier
# in this script (outside this chunk) — confirm against the full file.
extractor = ProductImageExtractor(pdf_file_path, model, tokenizer)

# Extract data and images
extracted_data = extractor.extract_product_data_with_images()

if extracted_data:
    # Convert to DataFrame for display
    df_results = pd.DataFrame(extracted_data)
    print(f"\n=== EXTRACTION COMPLETED ===")
    print(f"Total items extracted: {len(df_results)}")
    print(f"Items with product images: {len([item for item in extracted_data if item['Product Image File']])}")

    # Display first few results
    print("\n=== SAMPLE RESULTS ===")
    display_columns = ['Product Code', 'Description', 'Manufacturer', 'Product Image File']
    print(df_results[display_columns].head(10).to_string(index=False))
else:
    # extract_product_data_with_images returns None on open failure, or an
    # empty list when nothing was extracted — both land here.
    print("Failed to extract data from PDF")
660
+
661
def create_excel_with_embedded_images(data, output_filename):
    """Write *data* to an Excel workbook with product images embedded in-cell.

    Args:
        data: list of product dicts (as produced by the extractor); the
            'Product Image File' values are paths to PNG files on disk.
        output_filename: path of the .xlsx file to create.

    Fixes vs. the previous version:
    - ``image_col_index`` is resolved once up front (None when the
      "Product Image" column is absent), so the cell-formatting pass at the
      end can no longer raise NameError when that column is missing.
    - The per-column width log always reports the width actually set,
      instead of a stale value left over from a previous loop iteration.
    - The missing-column message now names the column actually looked up.
    """
    df = pd.DataFrame(data)

    print(f"Creating Excel file: {output_filename}")

    # Create Excel writer with xlsxwriter engine
    with pd.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Product Data', index=False)

        workbook = writer.book
        worksheet = writer.sheets['Product Data']

        def calculate_column_width(column_data, column_name, min_width=8, max_width=50):
            """Calculate optimal column width based on content length."""
            if len(column_data) == 0:
                return min_width

            # Longest rendered value in the column, header included.
            max_length = max(
                len(str(value)) for value in [column_name] + list(column_data)
            )

            # Apply some padding and clamp to sane bounds.
            return min(max(max_length * 1.2, min_width), max_width)

        # Fixed widths for the special columns, auto-calculated otherwise.
        for col_idx, column_name in enumerate(df.columns):
            if column_name == "Product Image":
                width = 20  # wide enough for the embedded thumbnail
            elif column_name == "Product Image File":
                width = 25  # image file path column
            elif column_name == "Description":
                width = calculate_column_width(df[column_name], column_name, min_width=15, max_width=40)
            elif column_name == "Material":
                width = calculate_column_width(df[column_name], column_name, min_width=12, max_width=35)
            elif column_name == "Dimensions":
                width = calculate_column_width(df[column_name], column_name, min_width=15, max_width=30)
            else:
                width = calculate_column_width(df[column_name], column_name)

            worksheet.set_column(col_idx, col_idx, width)
            print(f"Column '{column_name}': width = {width}")

        # Resolve the image column once; None means "no image column".
        image_col_index = (
            df.columns.get_loc("Product Image") if "Product Image" in df.columns else None
        )

        if image_col_index is not None:
            # Uniform image size settings
            UNIFORM_IMAGE_WIDTH = 120   # pixels
            UNIFORM_IMAGE_HEIGHT = 120  # pixels
            CELL_ROW_HEIGHT = 100       # points (Excel row height)

            # Insert images into cells with uniform sizing
            images_inserted = 0
            for row_num in range(1, len(df) + 1):  # row 0 is the header
                image_path = df.iloc[row_num - 1]['Product Image File']

                if image_path and os.path.exists(image_path):
                    try:
                        # Set consistent row height for all image rows
                        worksheet.set_row(row_num, CELL_ROW_HEIGHT)

                        # Scale so the image fits the target box while
                        # keeping its aspect ratio.
                        with Image.open(image_path) as img:
                            original_width, original_height = img.size

                        uniform_scale = min(
                            UNIFORM_IMAGE_WIDTH / original_width,
                            UNIFORM_IMAGE_HEIGHT / original_height,
                        )

                        worksheet.insert_image(
                            row_num, image_col_index, image_path,
                            {
                                'x_scale': uniform_scale,
                                'y_scale': uniform_scale,
                                'x_offset': 5,  # Small offset from cell border
                                'y_offset': 5,
                                'positioning': 1  # Move and size with cells
                            }
                        )
                        images_inserted += 1

                        print(f" → Inserted uniform image {images_inserted}: {os.path.basename(image_path)} "
                              f"(scale: {uniform_scale:.2f}, orig: {original_width}x{original_height})")

                    except Exception as e:
                        # One bad image must not abort the workbook.
                        print(f"Error embedding image {image_path}: {e}")

            print(f"\nExcel file created with {images_inserted} uniformly-sized embedded images!")
            print(f"All images scaled to approximately {UNIFORM_IMAGE_WIDTH}x{UNIFORM_IMAGE_HEIGHT} pixels")
        else:
            print("Product Image column not found")

        # Header styling.
        header_format = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BC',
            'border': 1
        })
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header_format)

        # Body styling: wrapped text everywhere; the image column gets an
        # empty bordered cell (the picture floats above it).
        wrap_format = workbook.add_format({
            'text_wrap': True,
            'valign': 'top',
            'border': 1
        })
        image_cell_format = workbook.add_format({
            'border': 1,
            'valign': 'top'
        })

        for row_num in range(1, len(df) + 1):
            for col_num in range(len(df.columns)):
                cell_value = df.iloc[row_num - 1, col_num]
                if col_num == image_col_index:  # never matches when index is None
                    worksheet.write(row_num, col_num, '', image_cell_format)
                else:
                    worksheet.write(row_num, col_num, cell_value, wrap_format)
803
if extracted_data:
    # Build the deliverable workbook with embedded product images.
    output_excel = "product_data_with_images.xlsx"
    create_excel_with_embedded_images(extracted_data, output_excel)

    # Create summary statistics
    df_results = pd.DataFrame(extracted_data)
    total_items = len(df_results)
    items_with_images = len(df_results[df_results['Product Image File'] != ''])
    unique_products = len(df_results[df_results['Product Code'] != '']['Product Code'].unique())

    print(f"\n=== FINAL SUMMARY ===")
    print(f"Total items extracted: {total_items}")
    print(f"Items with images: {items_with_images}")
    print(f"Unique products: {unique_products}")
    print(f"Images saved in: {extractor.image_save_dir}")
    print(f"Excel file: {output_excel}")

    print("Preparing files for download...")

    # NOTE(review): google.colab is only available inside Google Colab —
    # this import raises ModuleNotFoundError in any other runtime (e.g. a
    # HF Space); confirm this block is meant to run in Colab only.
    # Import the correct files module for Colab
    from google.colab import files as colab_files

    # Create a zip file with all results
    # zip_filename = "extraction_results.zip"
    # with zipfile.ZipFile(zip_filename, 'w') as zipf:
    #     # Add Excel file
    #     if os.path.exists("product_data_with_images.xlsx"):
    #         zipf.write("product_data_with_images.xlsx")

    #     # Add all extracted images
    #     if os.path.exists("extracted_product_images"):
    #         for root, dirs, files_list in os.walk("extracted_product_images"):
    #             for file in files_list:
    #                 file_path = os.path.join(root, file)
    #                 arcname = os.path.relpath(file_path, ".")
    #                 zipf.write(file_path, arcname)

    # print(f"Created zip file: {zip_filename}")

    # # Download the zip file
    # if os.path.exists(zip_filename):
    #     colab_files.download(zip_filename)
    #     print("Download started! Check your downloads folder.")
    # else:
    #     print("Error creating zip file")

    # Also download Excel separately
    if os.path.exists("product_data_with_images.xlsx"):
        colab_files.download("product_data_with_images.xlsx")
        print("Excel file download started!")

    print("\nExtraction completed successfully!")
    print("You should now have:")
    print("1. product_data_with_images.xlsx - Excel file with embedded images")
    # print("2. extraction_results.zip - Complete package with all files")
858
+
859
def run_quality_check(extracted_data):
    """Print a quality report for *extracted_data* and return summary stats.

    Counts records with each key field filled in, tallies unique product
    codes and manufacturers, and verifies that referenced image files
    actually exist on disk.

    Returns a dict with 'total_records', 'records_with_codes',
    'records_with_images', 'unique_codes' and 'existing_images'.
    """
    df = pd.DataFrame(extracted_data)

    def filled(column):
        # Rows where the given column holds a non-empty string.
        return df[df[column] != '']

    print("=== QUALITY CHECK REPORT ===")

    # Basic statistics
    records_with_codes = len(filled('Product Code'))
    records_with_images = len(filled('Product Image File'))
    print(f"Total records: {len(df)}")
    print(f"Records with Product Code: {records_with_codes}")
    print(f"Records with Description: {len(filled('Description'))}")
    print(f"Records with Images: {records_with_images}")

    # Product code analysis
    product_codes = filled('Product Code')['Product Code'].tolist()
    unique_codes = set(product_codes)
    print(f"Unique Product Codes: {len(unique_codes)}")

    if product_codes:
        print("Sample Product Codes:", list(unique_codes)[:5])

    # Image file verification
    image_files = filled('Product Image File')['Product Image File'].tolist()
    existing_images = [path for path in image_files if os.path.exists(path)]
    print(f"Image files that exist: {len(existing_images)}/{len(image_files)}")

    # Manufacturer analysis
    manufacturers = filled('Manufacturer')['Manufacturer'].unique()
    print(f"Unique Manufacturers: {len(manufacturers)}")

    return {
        'total_records': len(df),
        'records_with_codes': records_with_codes,
        'records_with_images': records_with_images,
        'unique_codes': len(unique_codes),
        'existing_images': len(existing_images)
    }
895
+
896
if extracted_data:
    # Sanity-check the extracted records before publishing anything.
    quality_stats = run_quality_check(extracted_data)

    # Persist the fine-tuned model and tokenizer locally.
    model_name = "Qwen3_4B_Base_fine_tuned"
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

    # NOTE(review): both pushes require a logged-in Hugging Face session;
    # push_to_hub_gguf is Unsloth's GGUF export/upload helper — confirm the
    # model object actually provides it in this environment.
    model.push_to_hub("pragneshr002/Qwen3_4B_Base_fine_tuned")

    model.push_to_hub_gguf(model_name, tokenizer, quantization_method="q4_k_m")