Spaces:

rairo
/

iris-products-api

Sleeping

App Files Files Community

rairo committed on Sep 20, 2025

Commit

a2c8a5f

verified ·

1 Parent(s): 203f22b

Update main.py

Browse files

Files changed (1) hide show

main.py +19 -24

main.py CHANGED Viewed

@@ -68,34 +68,29 @@ class Product(db.Model):
 # ───────────────────────────────────────────────────────────────────────────────
 FUZZY_MATCH_THRESHOLD = 85
-HS_CODES_DATA = []
 EXISTING_PRODUCT_NAMES = []
 HS_CODE_DESCRIPTIONS = {}
-def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
-    log.info(f"Parsing HS Codes from '{filepath}'...")
-    if not os.path.exists(filepath):
-        log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
-        return []
-    codes = []
     try:
-        with pdfplumber.open(filepath) as pdf:
-            for page in pdf.pages:
-                text = page.extract_text()
-                if not text:
-                    continue
-                # --- FIX: Made the regex more robust to handle newlines inside quotes ---
-                # This was the cause of parsing 0 HS codes.
-                matches = re.findall(r'\"(\d{8})\s*\"\s*,\s*\"(.*?)\s*\"', text, re.DOTALL)
-                for code, desc in matches:
-                    clean_desc = desc.replace('\n', ' ').strip()
-                    if code and clean_desc:
-                        codes.append({'code': code, 'description': clean_desc})
-                        HS_CODE_DESCRIPTIONS[clean_desc] = code
     except Exception as e:
-        log.error(f"Failed to parse PDF: {e}")
-    log.info(f"Successfully parsed {len(codes)} HS codes.")
-    return codes
 def load_existing_products(filepath='Product List.csv'):
     log.info(f"Loading master product list from '{filepath}'...")
@@ -238,7 +233,7 @@ if __name__ == "__main__":
     with app.app_context():
         log.info("Initializing server...")
         db.create_all()
-        HS_CODES_DATA = parse_hs_codes_pdf()
         EXISTING_PRODUCT_NAMES = load_existing_products()
         log.info(f"Server is ready. Database is at: {DB_PATH}")

 # ───────────────────────────────────────────────────────────────────────────────
 FUZZY_MATCH_THRESHOLD = 85
 EXISTING_PRODUCT_NAMES = []
 HS_CODE_DESCRIPTIONS = {}
+# --- FIX: Replaced the entire PDF parsing logic with a simple, robust CSV reader ---
+def load_hs_codes(filename="HS_Codes_for_use_under_FDMS.xlsx - Table 1.csv"):
+    """Loads HS codes from the user-provided clean CSV file."""
+    log.info(f"Loading HS Codes from '{filename}'...")
+    if not os.path.exists(filename):
+        log.error(f"HS Code file not found at '{filename}'. Categorization will fail.")
+        return
     try:
+        df = pd.read_csv(filename)
+        # Ensure the column names are correct
+        df.columns = ['HS CODE', 'GOODS DESCRIPTION']
+        for _, row in df.iterrows():
+            code = str(row['HS CODE']).strip()
+            desc = str(row['GOODS DESCRIPTION']).strip()
+            if code and desc and code != 'nan' and desc != 'nan':
+                HS_CODE_DESCRIPTIONS[desc] = code
+        log.info(f"Successfully parsed {len(HS_CODE_DESCRIPTIONS)} HS codes from CSV.")
     except Exception as e:
+        log.error(f"Failed to load HS codes from CSV: {e}")
 def load_existing_products(filepath='Product List.csv'):
     log.info(f"Loading master product list from '{filepath}'...")
     with app.app_context():
         log.info("Initializing server...")
         db.create_all()
+        load_hs_codes() # Call the new, corrected function
         EXISTING_PRODUCT_NAMES = load_existing_products()
         log.info(f"Server is ready. Database is at: {DB_PATH}")