rairo committed on
Commit
a2c8a5f
·
verified ·
1 Parent(s): 203f22b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +19 -24
main.py CHANGED
@@ -68,34 +68,29 @@ class Product(db.Model):
68
  # ───────────────────────────────────────────────────────────────────────────────
69
 
70
  FUZZY_MATCH_THRESHOLD = 85
71
- HS_CODES_DATA = []
72
  EXISTING_PRODUCT_NAMES = []
73
  HS_CODE_DESCRIPTIONS = {}
74
 
75
- def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
76
- log.info(f"Parsing HS Codes from '{filepath}'...")
77
- if not os.path.exists(filepath):
78
- log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
79
- return []
80
- codes = []
 
81
  try:
82
- with pdfplumber.open(filepath) as pdf:
83
- for page in pdf.pages:
84
- text = page.extract_text()
85
- if not text:
86
- continue
87
- # --- FIX: Made the regex more robust to handle newlines inside quotes ---
88
- # This was the cause of parsing 0 HS codes.
89
- matches = re.findall(r'\"(\d{8})\s*\"\s*,\s*\"(.*?)\s*\"', text, re.DOTALL)
90
- for code, desc in matches:
91
- clean_desc = desc.replace('\n', ' ').strip()
92
- if code and clean_desc:
93
- codes.append({'code': code, 'description': clean_desc})
94
- HS_CODE_DESCRIPTIONS[clean_desc] = code
95
  except Exception as e:
96
- log.error(f"Failed to parse PDF: {e}")
97
- log.info(f"Successfully parsed {len(codes)} HS codes.")
98
- return codes
99
 
100
  def load_existing_products(filepath='Product List.csv'):
101
  log.info(f"Loading master product list from '{filepath}'...")
@@ -238,7 +233,7 @@ if __name__ == "__main__":
238
  with app.app_context():
239
  log.info("Initializing server...")
240
  db.create_all()
241
- HS_CODES_DATA = parse_hs_codes_pdf()
242
  EXISTING_PRODUCT_NAMES = load_existing_products()
243
  log.info(f"Server is ready. Database is at: {DB_PATH}")
244
 
 
68
  # ───────────────────────────────────────────────────────────────────────────────
69
 
70
  FUZZY_MATCH_THRESHOLD = 85
 
71
  EXISTING_PRODUCT_NAMES = []
72
  HS_CODE_DESCRIPTIONS = {}
73
 
74
+ # --- FIX: Replaced the entire PDF parsing logic with a simple, robust CSV reader ---
75
+ def load_hs_codes(filename="HS_Codes_for_use_under_FDMS.xlsx - Table 1.csv"):
76
+ """Loads HS codes from the user-provided clean CSV file."""
77
+ log.info(f"Loading HS Codes from '{filename}'...")
78
+ if not os.path.exists(filename):
79
+ log.error(f"HS Code file not found at '{filename}'. Categorization will fail.")
80
+ return
81
  try:
82
+ df = pd.read_csv(filename)
83
+ # Ensure the column names are correct
84
+ df.columns = ['HS CODE', 'GOODS DESCRIPTION']
85
+ for _, row in df.iterrows():
86
+ code = str(row['HS CODE']).strip()
87
+ desc = str(row['GOODS DESCRIPTION']).strip()
88
+ if code and desc and code != 'nan' and desc != 'nan':
89
+ HS_CODE_DESCRIPTIONS[desc] = code
90
+ log.info(f"Successfully parsed {len(HS_CODE_DESCRIPTIONS)} HS codes from CSV.")
 
 
 
 
91
  except Exception as e:
92
+ log.error(f"Failed to load HS codes from CSV: {e}")
93
+
 
94
 
95
  def load_existing_products(filepath='Product List.csv'):
96
  log.info(f"Loading master product list from '{filepath}'...")
 
233
  with app.app_context():
234
  log.info("Initializing server...")
235
  db.create_all()
236
+ load_hs_codes() # Call the new, corrected function
237
  EXISTING_PRODUCT_NAMES = load_existing_products()
238
  log.info(f"Server is ready. Database is at: {DB_PATH}")
239