Spaces:

Alteredverse
/

open-catalog-parser

Build error

App Files Community

minar09 committed on Feb 7, 2025

Commit

29de06b

verified ·

1 Parent(s): b153f50

Update main.py

Browse files

Files changed (1) hide show

main.py +65 -69

main.py CHANGED Viewed

@@ -8,7 +8,7 @@ from dataclasses import dataclass
 from fastapi.encoders import jsonable_encoder
 import fitz  # PyMuPDF
 from sentence_transformers import SentenceTransformer
-from llama_cpp import Llama
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -29,46 +29,38 @@ class ProductSpec:
 class PDFProcessor:
     def __init__(self):
         self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
-        # Choose the appropriate model filename below; adjust if needed.
-        # self.llm = self._initialize_llm("deepseek-llm-7b-base.Q2_K.gguf")
-        self.llm = self._initialize_llm("llama-2-7b.Q2_K.gguf")
         self.output_dir = Path("./output")
         self.output_dir.mkdir(exist_ok=True)
     def _initialize_emb_model(self, model_name):
         try:
-            # Use SentenceTransformer if available
-            return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
         except Exception as e:
-            logger.warning(f"SentenceTransformer failed: {e}. Falling back to transformers model.")
             from transformers import AutoTokenizer, AutoModel
-            tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/" + model_name)
-            model = AutoModel.from_pretrained("sentence-transformers/" + model_name)
             return model
-    def _initialize_llm(self, model_name):
-        """Initialize LLM with automatic download if needed"""
-        # Here we use from_pretrained so that if the model is missing locally it downloads it.
-        model_path = os.path.join("models/", model_name)
-        if os.path.exists(model_path):
-            return Llama(
-                model_path=model_path,
-                n_ctx=1024,
-                n_gpu_layers=-1,
-                n_threads=os.cpu_count() - 1,
-                verbose=False
-            )
-        else:
-            return Llama.from_pretrained(
-                repo_id="Tien203/llama.cpp",
-                filename="Llama-2-7b-hf-q4_0.gguf",
             )
     def process_pdf(self, pdf_path: str) -> Dict:
-        """Process PDF using PyMuPDF"""
         start_time = time.time()
-        # Open PDF
         try:
             doc = fitz.open(pdf_path)
         except Exception as e:
@@ -78,37 +70,63 @@ class PDFProcessor:
         text_blocks = []
         tables = []
-        # Extract text and tables from each page
         for page_num, page in enumerate(doc):
-            # Extract text blocks from page and filter out very short blocks (noise)
             blocks = self._extract_text_blocks(page)
-            filtered = [block for block in blocks if len(block.strip()) >= 10]
-            logger.debug(f"Page {page_num + 1}: Extracted {len(blocks)} blocks, {len(filtered)} kept after filtering.")
-            text_blocks.extend(filtered)
-            # Extract tables (if any)
             tables.extend(self._extract_tables(page, page_num))
-        # Process text blocks with LLM to extract product information
         products = []
         for idx, block in enumerate(text_blocks):
-            # Log the text block for debugging
-            logger.debug(f"Processing text block {idx}: {block[:100]}...")
             product = self._process_text_block(block)
-            if product:
                 product.tables = tables
-                # Only add if at least one key (like name) is non-empty
-                if product.name or product.description or product.price or (
-                        product.attributes and len(product.attributes) > 0):
-                    products.append(product.to_dict())
-                else:
-                    logger.debug(f"LLM returned empty product for block {idx}.")
-            else:
-                logger.debug(f"No product extracted from block {idx}.")
         logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
         return {"products": products, "tables": tables}
     def _extract_text_blocks(self, page) -> List[str]:
         """Extract text blocks from a PDF page using PyMuPDF's blocks method."""
         blocks = []
@@ -138,27 +156,6 @@ class PDFProcessor:
             logger.warning(f"Error extracting tables from page {page_num + 1}: {e}")
         return tables
-    def _process_text_block(self, text: str) -> Optional[ProductSpec]:
-        """Process a text block with LLM to extract product specifications."""
-        prompt = self._generate_query_prompt(text)
-        logger.debug(f"Generated prompt: {prompt[:200]}...")
-        try:
-            response = self.llm.create_chat_completion(
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.1,
-                max_tokens=512
-            )
-            # Debug: log raw response
-            logger.debug(f"LLM raw response: {response}")
-            return self._parse_response(response['choices'][0]['message']['content'])
-        except Exception as e:
-            logger.warning(f"Error processing text block: {e}")
-            return None
-    def _generate_query_prompt(self, text: str) -> str:
-        """Generate a prompt instructing the LLM to extract product information."""
-        return f"""Extract product specifications from the following text. If no product is found, return an empty JSON object with keys.\n\nText:\n{text}\n\nReturn JSON format exactly as:\n{{\n    \"name\": \"product name\",\n    \"description\": \"product description\",\n    \"price\": numeric_price,\n    \"attributes\": {{ \"key\": \"value\" }}\n}}"""
     def _parse_response(self, response: str) -> Optional[ProductSpec]:
         """Parse the LLM's response to extract a product specification."""
         try:
@@ -193,7 +190,6 @@ def process_pdf_catalog(pdf_path: str):
 if __name__ == "__main__":
-    # Example usage: change this if you call process_pdf_catalog elsewhere
     pdf_path = "path/to/your/pdf_file.pdf"
     result, message = process_pdf_catalog(pdf_path)
-    print(result, message)

 from fastapi.encoders import jsonable_encoder
 import fitz  # PyMuPDF
 from sentence_transformers import SentenceTransformer
+from mlc_llm import MLCEngine
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class PDFProcessor:
     def __init__(self):
         self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
+        self.llm = self._initialize_llm()
         self.output_dir = Path("./output")
         self.output_dir.mkdir(exist_ok=True)
     def _initialize_emb_model(self, model_name):
         try:
+            return SentenceTransformer(f'sentence-transformers/{model_name}')
         except Exception as e:
+            logger.warning(f"SentenceTransformer failed: {e}")
             from transformers import AutoTokenizer, AutoModel
+            tokenizer = AutoTokenizer.from_pretrained(f"sentence-transformers/{model_name}")
+            model = AutoModel.from_pretrained(f"sentence-transformers/{model_name}")
             return model
+    def _initialize_llm(self):
+        """Initialize MLC LLM engine with optimized settings"""
+        try:
+            return MLCEngine(
+                model="HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
+                mode="server",
+                device="cuda" if os.getenv("USE_CUDA", "0") == "1" else "auto",
+                temperature=0.1,
+                max_tokens=512
             )
+        except Exception as e:
+            logger.error(f"Failed to initialize MLC Engine: {e}")
+            raise
     def process_pdf(self, pdf_path: str) -> Dict:
+        """Main PDF processing pipeline"""
         start_time = time.time()
         try:
             doc = fitz.open(pdf_path)
         except Exception as e:
         text_blocks = []
         tables = []
         for page_num, page in enumerate(doc):
             blocks = self._extract_text_blocks(page)
+            text_blocks.extend([b for b in blocks if len(b.strip()) >= 10])
             tables.extend(self._extract_tables(page, page_num))
         products = []
         for idx, block in enumerate(text_blocks):
             product = self._process_text_block(block)
+            if product and self._is_valid_product(product):
                 product.tables = tables
+                products.append(product.to_dict())
         logger.info(f"Processed {len(products)} products in {time.time() - start_time:.2f}s")
         return {"products": products, "tables": tables}
+    def _process_text_block(self, text: str) -> Optional[ProductSpec]:
+        """Process text with MLC LLM using optimized prompt"""
+        try:
+            prompt = self._generate_query_prompt(text)
+            response = self.llm.chat.completions.create(
+                messages=[{"role": "user", "content": prompt}],
+                stream=False
+            )
+            return self._parse_response(response.choices[0].message.content)
+        except Exception as e:
+            logger.warning(f"Error processing text block: {e}")
+            return None
+    def _generate_query_prompt(self, text: str) -> str:
+        """Generate structured prompt for better JSON response"""
+        return f"""Extract product specifications as JSON from this text:
+        Text: {text}
+        Return valid JSON with exactly these keys:
+        - name (string)
+        - description (string, optional)
+        - price (number, optional)
+        - attributes (object with key-value pairs, optional)
+        Example:
+        {{
+            "name": "Example Product",
+            "description": "High-quality example item",
+            "price": 99.99,
+            "attributes": {{"color": "red", "size": "XL"}}
+        }}"""
+    def _is_valid_product(self, product: ProductSpec) -> bool:
+        """Validate extracted product data"""
+        return any([
+            product.name,
+            product.description,
+            product.price,
+            product.attributes
+        ])
     def _extract_text_blocks(self, page) -> List[str]:
         """Extract text blocks from a PDF page using PyMuPDF's blocks method."""
         blocks = []
             logger.warning(f"Error extracting tables from page {page_num + 1}: {e}")
         return tables
     def _parse_response(self, response: str) -> Optional[ProductSpec]:
         """Parse the LLM's response to extract a product specification."""
         try:
 if __name__ == "__main__":
     pdf_path = "path/to/your/pdf_file.pdf"
     result, message = process_pdf_catalog(pdf_path)
+    print(json.dumps(result, indent=2), message)