Spaces:

rairo
/

iris-products-api

Sleeping

App Files Files Community

rairo committed on Sep 20, 2025

Commit

d1f57b9

verified ·

1 Parent(s): 35e9aaf

Create main.py

Browse files

Files changed (1) hide show

main.py +290 -0

main.py ADDED Viewed

	@@ -0,0 +1,290 @@

+import os
+import io
+import logging
+import re
+import pandas as pd
+import pdfplumber
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from flask_sqlalchemy import SQLAlchemy
+from sqlalchemy.exc import IntegrityError
+from thefuzz import process, fuzz
+from werkzeug.utils import secure_filename
+# ───────────────────────────────────────────────────────────────────────────────
+# CONFIGURATION
+# ───────────────────────────────────────────────────────────────────────────────
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("product-pipeline-api")
+app = Flask(__name__)
+CORS(app)
+# --- Database Configuration (Mocking MySQL with SQLite) ---
+# Use an in-memory SQLite database for simplicity and portability.
+# This mimics the real database without requiring a MySQL server for development.
+app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
+app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
+app.config['UPLOAD_FOLDER'] = 'uploads'
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+db = SQLAlchemy(app)
+# ───────────────────────────────────────────────────────────────────────────────
+# DATABASE MODEL (Based on products-20.sql)
+# ───────────────────────────────────────────────────────────────────────────────
+class Product(db.Model):
+    """Represents the 'products' table."""
+    __tablename__ = 'products'
+    id = db.Column(db.Integer, primary_key=True)
+    name = db.Column(db.String(255), nullable=False, unique=True)
+    category_id = db.Column(db.Integer, nullable=False, default=1)
+    primary_category = db.Column(db.String(255), nullable=False, default='N/A')
+    hs_code = db.Column(db.String(255), nullable=True)
+    def to_dict(self):
+        """Serializes the Product object to a dictionary."""
+        return {
+            'id': self.id,
+            'name': self.name,
+            'category_id': self.category_id,
+            'primary_category': self.primary_category,
+            'hs_code': self.hs_code
+        }
+    def __repr__(self):
+        return f'<Product {self.id}: {self.name}>'
+# ───────────────────────────────────────────────────────────────────────────────
+# DATA LOADING & PRE-PROCESSING
+# ───────────────────────────────────────────────────────────────────────────────
+# --- Constants for Validation Logic ---
+# Minimum fuzzy-match score at which an uploaded name is replaced by its
+# closest entry from the master product list.
+FUZZY_MATCH_THRESHOLD = 85  # Similarity score (out of 100) to consider a match
+# --- In-memory cache for essential data ---
+# List of {'code', 'description'} dicts as returned by parse_hs_codes_pdf().
+HS_CODES_DATA = []
+# Master product names as returned by load_existing_products().
+EXISTING_PRODUCT_NAMES = []
+# Maps HS description text -> HS code; filled in by parse_hs_codes_pdf().
+HS_CODE_DESCRIPTIONS = {}
+def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
+    """Extracts HS codes and descriptions from the PDF file."""
+    log.info(f"Parsing HS Codes from '{filepath}'...")
+    if not os.path.exists(filepath):
+        log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
+        return []
+    codes = []
+    try:
+        with pdfplumber.open(filepath) as pdf:
+            for page in pdf.pages:
+                text = page.extract_text()
+                # Regex to find HS codes and their descriptions.
+                # It looks for a pattern of numbers (code) followed by text (description).
+                matches = re.findall(r'\"(\d+)\n\"\,?\"(.*?)\n\"', text, re.DOTALL)
+                for code, desc in matches:
+                    clean_desc = desc.replace('\n', ' ').strip()
+                    if code and clean_desc:
+                        codes.append({'code': code, 'description': clean_desc})
+                        HS_CODE_DESCRIPTIONS[clean_desc] = code
+    except Exception as e:
+        log.error(f"Failed to parse PDF: {e}")
+    log.info(f"Successfully parsed {len(codes)} HS codes.")
+    return codes
+def load_existing_products(filepath='Product List.csv'):
+    """Loads the master product list for validation."""
+    log.info(f"Loading master product list from '{filepath}'...")
+    if not os.path.exists(filepath):
+        log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
+        return []
+    try:
+        df = pd.read_csv(filepath)
+        # Drop duplicates to ensure a clean list for matching
+        product_names = df['name'].dropna().unique().tolist()
+        log.info(f"Loaded {len(product_names)} unique existing products.")
+        return product_names
+    except Exception as e:
+        log.error(f"Failed to load master product list: {e}")
+        return []
+# ───────────────────────────────────────────────────────────────────────────────
+# CORE PROCESSING PIPELINE
+# ───────────────────────────────────────────────────────────────────────────────
+def process_uploaded_file(filepath):
+    """
+    The main pipeline to validate, clean, categorize, and store product data.
+    """
+    log.info(f"Starting processing for file: {filepath}")
+    results = {
+        "processed": 0,
+        "added": 0,
+        "updated": 0,
+        "skipped_duplicates": 0,
+        "errors": [],
+        "processed_data": []
+    }
+    try:
+        # The uploaded file might not have headers, so we name the column
+        df = pd.read_csv(filepath, header=None, names=['product_name_raw'])
+    except Exception as e:
+        log.error(f"Could not read CSV: {e}")
+        results['errors'].append(f"Invalid CSV format: {e}")
+        return results
+    # Extract the column with product names, even if its index is not 0
+    product_name_col = None
+    for col in df.columns:
+        # Heuristic: find the column that seems to contain product names (string type)
+        if df[col].dtype == 'object' and df[col].astype(str).str.contains('[a-zA-Z]').any():
+            product_name_col = col
+            break
+    if product_name_col is None:
+        results['errors'].append("Could not find a column with product names in the uploaded CSV.")
+        return results
+    for index, row in df.iterrows():
+        raw_name = row[product_name_col]
+        results['processed'] += 1
+        if not isinstance(raw_name, str) or not raw_name.strip():
+            continue # Skip empty rows
+        # --- 1. Validation & Cleaning using Fuzzy Matching ---
+        # Find the closest match from the master product list
+        best_match, score = process.extractOne(
+            raw_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
+        ) if EXISTING_PRODUCT_NAMES else (raw_name, 100)
+        cleaned_name = best_match if score >= FUZZY_MATCH_THRESHOLD else raw_name
+        log.info(f"'{raw_name}' -> '{cleaned_name}' (Score: {score})")
+        # --- 2. HS Code Categorization ---
+        # Find the best HS code description match for the cleaned name
+        best_hs_desc, _ = process.extractOne(
+            cleaned_name, HS_CODE_DESCRIPTIONS.keys()
+        ) if HS_CODE_DESCRIPTIONS else (None, 0)
+        hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
+        log.info(f"Assigned HS Code: {hs_code} (Based on: '{best_hs_desc}')")
+        # --- 3. Database Operation ---
+        processed_entry = {
+            "raw_name": raw_name,
+            "cleaned_name": cleaned_name,
+            "hs_code": hs_code,
+            "primary_category": best_hs_desc or "N/A",
+            "status": ""
+        }
+        try:
+            # Check if a product with this cleaned name already exists
+            existing_product = Product.query.filter_by(name=cleaned_name).first()
+            if existing_product:
+                # Update existing product if HS code is new
+                if hs_code and existing_product.hs_code != hs_code:
+                    existing_product.hs_code = hs_code
+                    existing_product.primary_category = best_hs_desc
+                    db.session.commit()
+                    results['updated'] += 1
+                    processed_entry['status'] = 'Updated'
+                else:
+                    results['skipped_duplicates'] += 1
+                    processed_entry['status'] = 'Skipped (Duplicate)'
+            else:
+                # Add new product
+                new_product = Product(
+                    name=cleaned_name,
+                    hs_code=hs_code,
+                    primary_category=best_hs_desc or 'N/A'
+                )
+                db.session.add(new_product)
+                db.session.commit()
+                results['added'] += 1
+                processed_entry['status'] = 'Added'
+            results['processed_data'].append(processed_entry)
+        except IntegrityError:
+            db.session.rollback()
+            log.warning(f"Integrity error for '{cleaned_name}', likely a race condition. Skipping.")
+            results['skipped_duplicates'] += 1
+        except Exception as e:
+            db.session.rollback()
+            log.error(f"Database error for '{cleaned_name}': {e}")
+            results['errors'].append(f"DB Error on '{cleaned_name}': {e}")
+    return results
+# ───────────────────────────────────────────────────────────────────────────────
+# ROUTES
+# ───────────────────────────────────────────────────────────────────────────────
+@app.get("/")
+def root():
+    return jsonify({"ok": True, "message": "The Product Validation server is running."})
+@app.post("/api/upload")
+def upload_products():
+    """Endpoint to upload and process a product CSV file."""
+    if 'file' not in request.files:
+        return jsonify({"ok": False, "error": "No file part in the request"}), 400
+    file = request.files['file']
+    if file.filename == '':
+        return jsonify({"ok": False, "error": "No file selected"}), 400
+    if file and file.filename.endswith('.csv'):
+        filename = secure_filename(file.filename)
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        file.save(filepath)
+        results = process_uploaded_file(filepath)
+        return jsonify({"ok": True, "message": "File processed successfully", "results": results})
+    return jsonify({"ok": False, "error": "Invalid file type. Please upload a CSV."}), 400
+@app.get("/api/products")
+def get_products():
+    """Endpoint to retrieve all processed products from the database."""
+    log.info("Request received to fetch all products.")
+    try:
+        all_products = Product.query.all()
+        # Use the to_dict() method to serialize each product object
+        products_list = [product.to_dict() for product in all_products]
+        log.info(f"Successfully retrieved {len(products_list)} products.")
+        return jsonify({"ok": True, "count": len(products_list), "products": products_list})
+    except Exception as e:
+        log.error(f"Could not retrieve products from database: {e}")
+        return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500
+# ───────────────────────────────────────────────────────────────────────────────
+# MAIN (Server Initialization)
+# ───────────────────────────────────────────────────────────────────────────────
+# Start-up sequence; runs only when this file is executed directly as a script.
+# NOTE(review): when the module is imported instead (e.g. by a WSGI server),
+# this block is skipped, so the tables are not created and the validation
+# caches stay empty -- confirm how the app is deployed.
+if __name__ == "__main__":
+    with app.app_context():
+        log.info("Initializing server...")
+        # Create database tables based on the model
+        db.create_all()
+        # Load validation data into memory
+        # These are module-level assignments, so they rebind the global caches
+        # declared near the top of the file.
+        HS_CODES_DATA = parse_hs_codes_pdf()
+        EXISTING_PRODUCT_NAMES = load_existing_products()
+        log.info("Server is ready and validation data is loaded.")
+    # The port comes from the PORT environment variable, defaulting to 7860.
+    port = int(os.environ.get("PORT", "7860"))
+    # Listen on all interfaces with Flask's debug mode off.
+    app.run(host="0.0.0.0", port=port, debug=False)