Spaces:

rairo
/

iris-products-api

Sleeping

App Files Community

rairo committed on Sep 20, 2025

Commit

b6ed410

verified ·

1 Parent(s): daddd99

Update main.py

Browse files

Files changed (1) hide show

main.py +72 -113

main.py CHANGED Viewed

@@ -21,14 +21,20 @@ log = logging.getLogger("product-pipeline-api")
 app = Flask(__name__)
 CORS(app)
-# --- Database Configuration (Mocking MySQL with SQLite) ---
-# Use an in-memory SQLite database for simplicity and portability.
-# This mimics the real database without requiring a MySQL server for development.
-app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
 app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
 app.config['UPLOAD_FOLDER'] = 'uploads'
 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
 db = SQLAlchemy(app)
 # ───────────────────────────────────────────────────────────────────────────────
@@ -61,29 +67,23 @@ class Product(db.Model):
 # DATA LOADING & PRE-PROCESSING
 # ───────────────────────────────────────────────────────────────────────────────
-# --- Constants for Validation Logic ---
-FUZZY_MATCH_THRESHOLD = 85  # Similarity score (out of 100) to consider a match
-# --- In-memory cache for essential data ---
 HS_CODES_DATA = []
 EXISTING_PRODUCT_NAMES = []
 HS_CODE_DESCRIPTIONS = {}
 def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
-    """Extracts HS codes and descriptions from the PDF file."""
     log.info(f"Parsing HS Codes from '{filepath}'...")
     if not os.path.exists(filepath):
         log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
         return []
     codes = []
     try:
         with pdfplumber.open(filepath) as pdf:
             for page in pdf.pages:
                 text = page.extract_text()
-                # Regex to find HS codes and their descriptions.
-                # It looks for a pattern of numbers (code) followed by text (description).
-                matches = re.findall(r'\"(\d+)\n\"\,?\"(.*?)\n\"', text, re.DOTALL)
                 for code, desc in matches:
                     clean_desc = desc.replace('\n', ' ').strip()
                     if code and clean_desc:
@@ -95,15 +95,13 @@ def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
     return codes
 def load_existing_products(filepath='Product List.csv'):
-    """Loads the master product list for validation."""
     log.info(f"Loading master product list from '{filepath}'...")
     if not os.path.exists(filepath):
         log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
         return []
     try:
-        df = pd.read_csv(filepath)
-        # Drop duplicates to ensure a clean list for matching
         product_names = df['name'].dropna().unique().tolist()
         log.info(f"Loaded {len(product_names)} unique existing products.")
         return product_names
@@ -115,153 +113,118 @@ def load_existing_products(filepath='Product List.csv'):
 # CORE PROCESSING PIPELINE
 # ───────────────────────────────────────────────────────────────────────────────
-def process_uploaded_file(filepath):
-    """
-    The main pipeline to validate, clean, categorize, and store product data.
-    """
     log.info(f"Starting processing for file: {filepath}")
     results = {
-        "processed": 0,
-        "added": 0,
-        "updated": 0,
-        "skipped_duplicates": 0,
-        "errors": [],
-        "processed_data": []
     }
     try:
-        # The uploaded file might not have headers, so we name the column
-        df = pd.read_csv(filepath, header=None, names=['product_name_raw'])
     except Exception as e:
-        log.error(f"Could not read CSV: {e}")
-        results['errors'].append(f"Invalid CSV format: {e}")
         return results
-    # Extract the column with product names, even if its index is not 0
-    product_name_col = None
-    for col in df.columns:
-        # Heuristic: find the column that seems to contain product names (string type)
-        if df[col].dtype == 'object' and df[col].astype(str).str.contains('[a-zA-Z]').any():
-            product_name_col = col
-            break
-    if product_name_col is None:
-        results['errors'].append("Could not find a column with product names in the uploaded CSV.")
         return results
     for index, row in df.iterrows():
-        raw_name = row[product_name_col]
         results['processed'] += 1
         if not isinstance(raw_name, str) or not raw_name.strip():
-            continue # Skip empty rows
-        # --- 1. Validation & Cleaning using Fuzzy Matching ---
-        # Find the closest match from the master product list
-        best_match, score = process.extractOne(
-            raw_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
-        ) if EXISTING_PRODUCT_NAMES else (raw_name, 100)
-        cleaned_name = best_match if score >= FUZZY_MATCH_THRESHOLD else raw_name
-        log.info(f"'{raw_name}' -> '{cleaned_name}' (Score: {score})")
-        # --- 2. HS Code Categorization ---
-        # Find the best HS code description match for the cleaned name
         best_hs_desc, _ = process.extractOne(
-            cleaned_name, HS_CODE_DESCRIPTIONS.keys()
         ) if HS_CODE_DESCRIPTIONS else (None, 0)
         hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
-        log.info(f"Assigned HS Code: {hs_code} (Based on: '{best_hs_desc}')")
-        # --- 3. Database Operation ---
         processed_entry = {
-            "raw_name": raw_name,
-            "cleaned_name": cleaned_name,
-            "hs_code": hs_code,
-            "primary_category": best_hs_desc or "N/A",
-            "status": ""
         }
         try:
-            # Check if a product with this cleaned name already exists
-            existing_product = Product.query.filter_by(name=cleaned_name).first()
-            if existing_product:
-                # Update existing product if HS code is new
-                if hs_code and existing_product.hs_code != hs_code:
-                    existing_product.hs_code = hs_code
-                    existing_product.primary_category = best_hs_desc
-                    db.session.commit()
-                    results['updated'] += 1
-                    processed_entry['status'] = 'Updated'
                 else:
-                    results['skipped_duplicates'] += 1
-                    processed_entry['status'] = 'Skipped (Duplicate)'
-            else:
-                # Add new product
-                new_product = Product(
-                    name=cleaned_name,
-                    hs_code=hs_code,
-                    primary_category=best_hs_desc or 'N/A'
-                )
-                db.session.add(new_product)
-                db.session.commit()
-                results['added'] += 1
-                processed_entry['status'] = 'Added'
-            results['processed_data'].append(processed_entry)
-        except IntegrityError:
-            db.session.rollback()
-            log.warning(f"Integrity error for '{cleaned_name}', likely a race condition. Skipping.")
-            results['skipped_duplicates'] += 1
         except Exception as e:
             db.session.rollback()
-            log.error(f"Database error for '{cleaned_name}': {e}")
-            results['errors'].append(f"DB Error on '{cleaned_name}': {e}")
     return results
 # ───────────────────────────────────────────────────────────────────────────────
 # ROUTES
 # ───────────────────────────────────────────────────────────────────────────────
 @app.get("/")
 def root():
     return jsonify({"ok": True, "message": "The Product Validation server is running."})
 @app.post("/api/upload")
 def upload_products():
-    """Endpoint to upload and process a product CSV file."""
     if 'file' not in request.files:
         return jsonify({"ok": False, "error": "No file part in the request"}), 400
     file = request.files['file']
     if file.filename == '':
         return jsonify({"ok": False, "error": "No file selected"}), 400
-    if file and file.filename.endswith('.csv'):
         filename = secure_filename(file.filename)
         filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
         file.save(filepath)
-        results = process_uploaded_file(filepath)
         return jsonify({"ok": True, "message": "File processed successfully", "results": results})
-    return jsonify({"ok": False, "error": "Invalid file type. Please upload a CSV."}), 400
 @app.get("/api/products")
 def get_products():
-    """Endpoint to retrieve all processed products from the database."""
     log.info("Request received to fetch all products.")
     try:
         all_products = Product.query.all()
-        # Use the to_dict() method to serialize each product object
         products_list = [product.to_dict() for product in all_products]
         log.info(f"Successfully retrieved {len(products_list)} products.")
         return jsonify({"ok": True, "count": len(products_list), "products": products_list})
@@ -269,7 +232,6 @@ def get_products():
         log.error(f"Could not retrieve products from database: {e}")
         return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500
 # ───────────────────────────────────────────────────────────────────────────────
 # MAIN (Server Initialization)
 # ───────────────────────────────────────────────────────────────────────────────
@@ -277,13 +239,10 @@ def get_products():
 if __name__ == "__main__":
     with app.app_context():
         log.info("Initializing server...")
-        # Create database tables based on the model
         db.create_all()
-        # Load validation data into memory
         HS_CODES_DATA = parse_hs_codes_pdf()
         EXISTING_PRODUCT_NAMES = load_existing_products()
-        log.info("Server is ready and validation data is loaded.")
     port = int(os.environ.get("PORT", "7860"))
     app.run(host="0.0.0.0", port=port, debug=False)

 app = Flask(__name__)
 CORS(app)
+# --- App Configuration ---
+# --- FIX 1: Switched to a persistent file-based SQLite database ---
+# This ensures data survives between requests on Hugging Face Spaces.
+DB_FOLDER = 'data'
+DB_PATH = os.path.join(DB_FOLDER, 'products.db')
+os.makedirs(DB_FOLDER, exist_ok=True)
+app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
 app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
 app.config['UPLOAD_FOLDER'] = 'uploads'
 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+# --- File Upload Configuration ---
+ALLOWED_EXTENSIONS = {'csv', 'xls', 'xlsx'}
 db = SQLAlchemy(app)
 # ───────────────────────────────────────────────────────────────────────────────
 # DATA LOADING & PRE-PROCESSING
 # ───────────────────────────────────────────────────────────────────────────────
+FUZZY_MATCH_THRESHOLD = 85
 HS_CODES_DATA = []
 EXISTING_PRODUCT_NAMES = []
 HS_CODE_DESCRIPTIONS = {}
 def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
     log.info(f"Parsing HS Codes from '{filepath}'...")
     if not os.path.exists(filepath):
         log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
         return []
     codes = []
     try:
         with pdfplumber.open(filepath) as pdf:
             for page in pdf.pages:
                 text = page.extract_text()
+                # Improved regex to handle variations in PDF formatting
+                matches = re.findall(r'\"(\d{8})\"\s*,\s*\"(.*?)\"', text, re.DOTALL)
                 for code, desc in matches:
                     clean_desc = desc.replace('\n', ' ').strip()
                     if code and clean_desc:
     return codes
 def load_existing_products(filepath='Product List.csv'):
     log.info(f"Loading master product list from '{filepath}'...")
     if not os.path.exists(filepath):
         log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
         return []
     try:
+        # Based on the CSV structure, the 'name' is in the second column.
+        df = pd.read_csv(filepath, usecols=[1], names=['name'], header=0)
         product_names = df['name'].dropna().unique().tolist()
         log.info(f"Loaded {len(product_names)} unique existing products.")
         return product_names
 # CORE PROCESSING PIPELINE
 # ───────────────────────────────────────────────────────────────────────────────
+def process_uploaded_file(filepath, filename):
+    """The main pipeline to validate, clean, categorize, and store product data."""
     log.info(f"Starting processing for file: {filepath}")
     results = {
+        "processed": 0, "added": 0, "updated": 0, "skipped_duplicates": 0,
+        "errors": [], "processed_data": []
     }
+    df = None
     try:
+        file_ext = filename.rsplit('.', 1)[1].lower()
+        # --- FIX 2: Robustly parse the second column (index 1) for names ---
+        # The user's uploaded `list.csv` clearly has the product name in the second column.
+        if file_ext == 'csv':
+            df = pd.read_csv(filepath, header=None, usecols=[1], names=['product_name'])
+        elif file_ext in ['xls', 'xlsx']:
+            df = pd.read_excel(filepath, header=None, usecols=[1], names=['product_name'], engine='openpyxl')
+    except ValueError:
+         results['errors'].append("Could not find the product name column. Ensure the product name is in the second column.")
+         return results
     except Exception as e:
+        log.error(f"Could not read the uploaded file: {e}")
+        results['errors'].append(f"Invalid file format or corrupt file: {e}")
         return results
+    if df.empty:
+        results['errors'].append("The uploaded file is empty.")
         return results
     for index, row in df.iterrows():
+        raw_name = row['product_name']
         results['processed'] += 1
         if not isinstance(raw_name, str) or not raw_name.strip():
+            continue
+        cleaned_name = raw_name.strip()
+        best_match, score = process.extractOne(
+            cleaned_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
+        ) if EXISTING_PRODUCT_NAMES else (cleaned_name, 100)
+        validated_name = best_match if score >= FUZZY_MATCH_THRESHOLD else cleaned_name
         best_hs_desc, _ = process.extractOne(
+            validated_name, HS_CODE_DESCRIPTIONS.keys()
         ) if HS_CODE_DESCRIPTIONS else (None, 0)
         hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
         processed_entry = {
+            "raw_name": raw_name, "cleaned_name": validated_name, "hs_code": hs_code,
+            "primary_category": best_hs_desc or "N/A", "status": ""
         }
         try:
+            # Each operation needs its own app context to interact with the database
+            with app.app_context():
+                existing_product = Product.query.filter_by(name=validated_name).first()
+                if existing_product:
+                    if hs_code and existing_product.hs_code != hs_code:
+                        existing_product.hs_code = hs_code
+                        existing_product.primary_category = best_hs_desc
+                        db.session.commit()
+                        results['updated'] += 1
+                        processed_entry['status'] = 'Updated'
+                    else:
+                        results['skipped_duplicates'] += 1
+                        processed_entry['status'] = 'Skipped (Duplicate)'
                 else:
+                    new_product = Product(name=validated_name, hs_code=hs_code, primary_category=best_hs_desc or 'N/A')
+                    db.session.add(new_product)
+                    db.session.commit()
+                    results['added'] += 1
+                    processed_entry['status'] = 'Added'
+                results['processed_data'].append(processed_entry)
         except Exception as e:
             db.session.rollback()
+            log.error(f"Database error for '{validated_name}': {e}")
+            results['errors'].append(f"DB Error on '{validated_name}': {e}")
     return results
 # ───────────────────────────────────────────────────────────────────────────────
 # ROUTES
 # ───────────────────────────────────────────────────────────────────────────────
+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
 @app.get("/")
 def root():
     return jsonify({"ok": True, "message": "The Product Validation server is running."})
 @app.post("/api/upload")
 def upload_products():
     if 'file' not in request.files:
         return jsonify({"ok": False, "error": "No file part in the request"}), 400
     file = request.files['file']
     if file.filename == '':
         return jsonify({"ok": False, "error": "No file selected"}), 400
+    if file and allowed_file(file.filename):
         filename = secure_filename(file.filename)
         filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
         file.save(filepath)
+        results = process_uploaded_file(filepath, filename)
         return jsonify({"ok": True, "message": "File processed successfully", "results": results})
+    return jsonify({"ok": False, "error": f"Invalid file type. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
 @app.get("/api/products")
 def get_products():
     log.info("Request received to fetch all products.")
     try:
         all_products = Product.query.all()
         products_list = [product.to_dict() for product in all_products]
         log.info(f"Successfully retrieved {len(products_list)} products.")
         return jsonify({"ok": True, "count": len(products_list), "products": products_list})
         log.error(f"Could not retrieve products from database: {e}")
         return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500
 # ───────────────────────────────────────────────────────────────────────────────
 # MAIN (Server Initialization)
 # ───────────────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     with app.app_context():
         log.info("Initializing server...")
         db.create_all()
         HS_CODES_DATA = parse_hs_codes_pdf()
         EXISTING_PRODUCT_NAMES = load_existing_products()
+        log.info(f"Server is ready. Database is at: {DB_PATH}")
     port = int(os.environ.get("PORT", "7860"))
     app.run(host="0.0.0.0", port=port, debug=False)