rairo commited on
Commit
5d5ed5e
·
verified ·
1 Parent(s): 6affcb6

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +228 -346
main.py CHANGED
@@ -11,7 +11,6 @@ from sqlalchemy.exc import IntegrityError
11
  from thefuzz import process, fuzz
12
  from werkzeug.utils import secure_filename
13
  import tempfile
14
- import stat
15
 
16
  # ───────────────────────────────────────────────────────────────────────────────
17
  # CONFIGURATION
@@ -23,95 +22,89 @@ log = logging.getLogger("product-pipeline-api")
23
  app = Flask(__name__)
24
  CORS(app)
25
 
26
- # --- Database Path Resolution with Fallbacks ---
27
- def get_database_path():
28
- """
29
- Robust database path resolution with multiple fallbacks for Hugging Face Spaces.
30
- """
31
- # Try different paths in order of preference
32
- potential_paths = [
33
- # Standard data directory
34
- os.path.join('data', 'products.db'),
35
- # Temp directory (usually writable)
36
- os.path.join(tempfile.gettempdir(), 'products.db'),
37
- # Current working directory
38
- 'products.db',
39
- # User home directory
40
- os.path.expanduser('~/products.db'),
41
- # In-memory database as last resort
42
- ':memory:'
43
- ]
44
 
45
- for db_path in potential_paths:
46
- if db_path == ':memory:':
47
- log.warning("Using in-memory database - data will not persist between restarts")
48
- return db_path
49
-
50
- try:
51
- # Create directory if it doesn't exist
52
- db_dir = os.path.dirname(db_path)
53
- if db_dir and not os.path.exists(db_dir):
54
- os.makedirs(db_dir, exist_ok=True)
55
-
56
- # Test write permissions
57
- test_file = db_path + '.test'
58
- with open(test_file, 'w') as f:
59
- f.write('test')
60
- os.remove(test_file)
61
-
62
- log.info(f"Selected database path: {os.path.abspath(db_path)}")
63
- return db_path
64
-
65
- except (OSError, PermissionError) as e:
66
- log.warning(f"Cannot use database path {db_path}: {e}")
67
- continue
68
 
69
- # Should never reach here due to :memory: fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  return ':memory:'
71
 
72
- # --- App Configuration ---
73
- DB_PATH = get_database_path()
74
  app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
75
  app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
76
 
77
- # --- Upload Folder Configuration ---
78
- def get_upload_folder():
79
- """Get a writable upload folder with fallbacks."""
80
- potential_folders = [
81
- 'uploads',
82
- os.path.join(tempfile.gettempdir(), 'uploads'),
83
- tempfile.mkdtemp(prefix='uploads_')
84
- ]
85
-
86
- for folder in potential_folders:
87
- try:
88
- os.makedirs(folder, exist_ok=True)
89
- # Test write permissions
90
- test_file = os.path.join(folder, 'test.tmp')
91
- with open(test_file, 'w') as f:
92
- f.write('test')
93
- os.remove(test_file)
94
- log.info(f"Using upload folder: {os.path.abspath(folder)}")
95
- return folder
96
- except (OSError, PermissionError) as e:
97
- log.warning(f"Cannot use upload folder {folder}: {e}")
98
- continue
99
-
100
- # Fallback to temp directory
101
- return tempfile.gettempdir()
102
 
103
- app.config['UPLOAD_FOLDER'] = get_upload_folder()
104
 
105
  # --- File Upload Configuration ---
106
  ALLOWED_EXTENSIONS = {'csv', 'xls', 'xlsx'}
107
 
108
- # --- Database Initialization with Error Handling ---
 
109
  try:
110
  db = SQLAlchemy(app)
111
  log.info("SQLAlchemy initialized successfully")
112
  except Exception as e:
113
  log.error(f"Failed to initialize SQLAlchemy: {e}")
114
- raise
 
 
 
115
 
116
  # ───────────────────────────────────────────────────────────────────────────────
117
  # DATABASE MODEL (Based on products-20.sql)
@@ -139,47 +132,6 @@ class Product(db.Model):
139
  def __repr__(self):
140
  return f'<Product {self.id}: {self.name}>'
141
 
142
- # ───────────────────────────────────────────────────────────────────────────────
143
- # DATABASE INITIALIZATION WITH ROBUST ERROR HANDLING
144
- # ───────────────────────────────────────────────────────────────────────────────
145
-
146
- def initialize_database():
147
- """Initialize database with comprehensive error handling."""
148
- max_retries = 3
149
- retry_count = 0
150
-
151
- while retry_count < max_retries:
152
- try:
153
- with app.app_context():
154
- # Test database connection first
155
- db.engine.execute('SELECT 1')
156
- log.info("Database connection test successful")
157
-
158
- # Create all tables
159
- db.create_all()
160
- log.info("Database tables created successfully")
161
- return True
162
-
163
- except Exception as e:
164
- retry_count += 1
165
- log.error(f"Database initialization attempt {retry_count} failed: {e}")
166
-
167
- if retry_count < max_retries:
168
- log.info(f"Retrying database initialization... (attempt {retry_count + 1}/{max_retries})")
169
-
170
- # Try fallback to in-memory database
171
- if retry_count == max_retries - 1:
172
- log.warning("Falling back to in-memory database")
173
- app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
174
- # Recreate the database instance
175
- global db
176
- db = SQLAlchemy(app)
177
- else:
178
- log.error("All database initialization attempts failed")
179
- return False
180
-
181
- return False
182
-
183
  # ───────────────────────────────────────────────────────────────────────────────
184
  # DATA LOADING & PRE-PROCESSING
185
  # ───────────────────────────────────────────────────────────────────────────────
@@ -190,86 +142,51 @@ EXISTING_PRODUCT_NAMES = []
190
  HS_CODE_DESCRIPTIONS = {}
191
 
192
  def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
193
- """Parse HS Codes from PDF with error handling."""
194
- log.info(f"Attempting to parse HS Codes from '{filepath}'...")
195
  if not os.path.exists(filepath):
196
  log.warning(f"HS Code PDF not found at '{filepath}'. Categorization will be limited.")
197
  return []
198
-
199
  codes = []
200
  try:
201
  with pdfplumber.open(filepath) as pdf:
202
- for page_num, page in enumerate(pdf.pages):
203
- try:
204
- text = page.extract_text()
205
- if not text:
206
- continue
207
-
208
- # Improved regex to handle variations in PDF formatting
209
- matches = re.findall(r'\"(\d{8})\"\s*,\s*\"(.*?)\"', text, re.DOTALL)
210
- for code, desc in matches:
211
- clean_desc = desc.replace('\n', ' ').strip()
212
- if code and clean_desc:
213
- codes.append({'code': code, 'description': clean_desc})
214
- HS_CODE_DESCRIPTIONS[clean_desc] = code
215
-
216
- except Exception as e:
217
- log.warning(f"Error processing page {page_num + 1}: {e}")
218
  continue
219
-
 
 
 
 
 
 
220
  except Exception as e:
221
- log.error(f"Failed to parse PDF '{filepath}': {e}")
222
-
223
- log.info(f"Successfully parsed {len(codes)} HS codes from PDF.")
224
  return codes
225
 
226
  def load_existing_products(filepath='Product List.csv'):
227
- """Load existing products with robust error handling."""
228
- log.info(f"Attempting to load master product list from '{filepath}'...")
229
  if not os.path.exists(filepath):
230
- log.warning(f"Master product list not found at '{filepath}'. Using database products only.")
231
  return []
232
-
233
  try:
234
- # Try different encodings
235
- encodings = ['utf-8', 'latin-1', 'cp1252']
236
- df = None
237
-
238
- for encoding in encodings:
239
  try:
240
- df = pd.read_csv(filepath, encoding=encoding)
241
- log.info(f"Successfully read CSV with {encoding} encoding")
242
- break
 
 
243
  except UnicodeDecodeError:
244
  continue
245
-
246
- if df is None:
247
- log.error("Failed to read CSV with any supported encoding")
248
- return []
249
-
250
- # Flexible column detection - try different approaches
251
- product_names = []
252
-
253
- if len(df.columns) >= 2:
254
- # Use second column if available
255
- product_names = df.iloc[:, 1].dropna().astype(str).unique().tolist()
256
- elif 'name' in df.columns:
257
- product_names = df['name'].dropna().astype(str).unique().tolist()
258
- elif 'product' in df.columns:
259
- product_names = df['product'].dropna().astype(str).unique().tolist()
260
- else:
261
- # Use first column as fallback
262
- product_names = df.iloc[:, 0].dropna().astype(str).unique().tolist()
263
-
264
- # Clean product names
265
- product_names = [name.strip() for name in product_names if name.strip()]
266
-
267
- log.info(f"Loaded {len(product_names)} unique existing products from CSV.")
268
- return product_names
269
-
270
  except Exception as e:
271
  log.error(f"Failed to load master product list: {e}")
272
- return []
273
 
274
  # ───────────────────────────────────────────────────────────────────────────────
275
  # CORE PROCESSING PIPELINE
@@ -282,127 +199,110 @@ def process_uploaded_file(filepath, filename):
282
  "processed": 0, "added": 0, "updated": 0, "skipped_duplicates": 0,
283
  "errors": [], "processed_data": []
284
  }
285
-
 
286
  try:
287
- # Read file with robust error handling
288
- df = None
289
  file_ext = filename.rsplit('.', 1)[1].lower() if '.' in filename else ''
290
 
291
- # Try different encodings and methods
292
- encodings = ['utf-8', 'latin-1', 'cp1252']
293
-
294
- for encoding in encodings:
295
- try:
296
- if file_ext == 'csv':
297
- df = pd.read_csv(filepath, encoding=encoding, header=None)
298
- elif file_ext in ['xls', 'xlsx']:
299
- df = pd.read_excel(filepath, header=None, engine='openpyxl')
300
-
301
- if df is not None:
302
- log.info(f"Successfully read file with {encoding if file_ext == 'csv' else 'excel'} format")
303
  break
304
-
305
- except Exception as e:
306
- log.warning(f"Failed to read with {encoding}: {e}")
307
- continue
308
-
309
- if df is None or df.empty:
310
- results['errors'].append("Could not read the uploaded file or file is empty")
311
- return results
312
-
313
- # Flexible column detection
314
- product_column_idx = None
315
- if len(df.columns) >= 2:
316
- product_column_idx = 1 # Second column
317
- else:
318
- product_column_idx = 0 # First column as fallback
319
-
320
- log.info(f"Using column index {product_column_idx} for product names")
321
-
322
- for index, row in df.iterrows():
323
- try:
324
- raw_name = row.iloc[product_column_idx] if len(row) > product_column_idx else None
325
- results['processed'] += 1
326
-
327
- if pd.isna(raw_name) or not str(raw_name).strip():
328
  continue
 
 
 
 
 
 
 
 
329
 
330
- cleaned_name = str(raw_name).strip()
331
-
332
- # Fuzzy matching with existing products
333
- best_match, score = (cleaned_name, 100)
334
- if EXISTING_PRODUCT_NAMES:
335
- try:
336
- best_match, score = process.extractOne(
337
- cleaned_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
338
- )
339
- except Exception as e:
340
- log.warning(f"Fuzzy matching failed for '{cleaned_name}': {e}")
341
-
342
- validated_name = best_match if score >= FUZZY_MATCH_THRESHOLD else cleaned_name
343
-
344
- # HS Code matching
345
- best_hs_desc = None
346
- hs_code = None
347
- if HS_CODE_DESCRIPTIONS:
348
- try:
349
- best_hs_desc, _ = process.extractOne(
350
- validated_name, list(HS_CODE_DESCRIPTIONS.keys())
351
- )
352
- hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
353
- except Exception as e:
354
- log.warning(f"HS code matching failed for '{validated_name}': {e}")
355
-
356
- processed_entry = {
357
- "raw_name": str(raw_name),
358
- "cleaned_name": validated_name,
359
- "hs_code": hs_code,
360
- "primary_category": best_hs_desc or "N/A",
361
- "status": ""
362
- }
363
-
364
- # Database operations with error handling
365
  try:
366
- existing_product = Product.query.filter_by(name=validated_name).first()
367
- if existing_product:
368
- if hs_code and existing_product.hs_code != hs_code:
369
- existing_product.hs_code = hs_code
370
- existing_product.primary_category = best_hs_desc or "N/A"
371
- db.session.commit()
372
- results['updated'] += 1
373
- processed_entry['status'] = 'Updated'
374
- else:
375
- results['skipped_duplicates'] += 1
376
- processed_entry['status'] = 'Skipped (Duplicate)'
377
- else:
378
- new_product = Product(
379
- name=validated_name,
380
- hs_code=hs_code,
381
- primary_category=best_hs_desc or 'N/A'
382
- )
383
- db.session.add(new_product)
384
- db.session.commit()
385
- results['added'] += 1
386
- processed_entry['status'] = 'Added'
387
-
388
- results['processed_data'].append(processed_entry)
389
-
390
  except Exception as e:
391
- db.session.rollback()
392
- log.error(f"Database error for '{validated_name}': {e}")
393
- results['errors'].append(f"DB Error on '{validated_name}': {str(e)}")
394
- processed_entry['status'] = 'Error'
395
- results['processed_data'].append(processed_entry)
396
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  except Exception as e:
398
- log.error(f"Error processing row {index}: {e}")
399
- results['errors'].append(f"Row {index + 1} processing error: {str(e)}")
400
- continue
401
-
402
- except Exception as e:
403
- log.error(f"Critical error processing file: {e}")
404
- results['errors'].append(f"Critical processing error: {str(e)}")
405
-
 
 
406
  return results
407
 
408
  # ───────────────────────────────────────────────────────────────────────────────
@@ -417,35 +317,13 @@ def root():
417
  return jsonify({
418
  "ok": True,
419
  "message": "The Product Validation server is running.",
420
- "database_path": DB_PATH,
421
- "upload_folder": app.config['UPLOAD_FOLDER']
422
  })
423
 
424
- @app.get("/api/health")
425
- def health_check():
426
- """Health check endpoint."""
427
- try:
428
- # Test database connection
429
- with app.app_context():
430
- db.engine.execute('SELECT 1')
431
-
432
- return jsonify({
433
- "ok": True,
434
- "database": "connected",
435
- "products_count": Product.query.count()
436
- })
437
- except Exception as e:
438
- return jsonify({
439
- "ok": False,
440
- "database": "disconnected",
441
- "error": str(e)
442
- }), 500
443
-
444
  @app.post("/api/upload")
445
  def upload_products():
446
  if 'file' not in request.files:
447
  return jsonify({"ok": False, "error": "No file part in the request"}), 400
448
-
449
  file = request.files['file']
450
  if file.filename == '':
451
  return jsonify({"ok": False, "error": "No file selected"}), 400
@@ -455,32 +333,20 @@ def upload_products():
455
  filename = secure_filename(file.filename)
456
  filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
457
  file.save(filepath)
458
-
459
  results = process_uploaded_file(filepath, filename)
460
 
461
  # Clean up uploaded file
462
  try:
463
  os.remove(filepath)
464
  except:
465
- pass # Non-critical if cleanup fails
466
 
467
- return jsonify({
468
- "ok": True,
469
- "message": "File processed successfully",
470
- "results": results
471
- })
472
-
473
  except Exception as e:
474
- log.error(f"Upload processing error: {e}")
475
- return jsonify({
476
- "ok": False,
477
- "error": f"File processing failed: {str(e)}"
478
- }), 500
479
 
480
- return jsonify({
481
- "ok": False,
482
- "error": f"Invalid file type. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"
483
- }), 400
484
 
485
  @app.get("/api/products")
486
  def get_products():
@@ -489,41 +355,57 @@ def get_products():
489
  all_products = Product.query.all()
490
  products_list = [product.to_dict() for product in all_products]
491
  log.info(f"Successfully retrieved {len(products_list)} products.")
492
- return jsonify({
493
- "ok": True,
494
- "count": len(products_list),
495
- "products": products_list
496
- })
497
  except Exception as e:
498
  log.error(f"Could not retrieve products from database: {e}")
499
- return jsonify({
500
- "ok": False,
501
- "error": f"Failed to retrieve products: {str(e)}"
502
- }), 500
503
 
504
  # ───────────────────────────────────────────────────────────────────────────────
505
  # MAIN (Server Initialization)
506
  # ───────────────────────────────────────────────────────────────────────────────
507
 
508
  if __name__ == "__main__":
509
- log.info("===== Application Startup at 2025-09-20 14:18:26 =====")
510
-
511
- # Initialize database with error handling
512
- if not initialize_database():
513
- log.error("Failed to initialize database. Server may not function properly.")
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  # Load supporting data (non-critical)
516
  try:
517
- log.info("Loading supporting data...")
518
  HS_CODES_DATA = parse_hs_codes_pdf()
519
  EXISTING_PRODUCT_NAMES = load_existing_products()
520
  log.info("Supporting data loaded successfully")
521
  except Exception as e:
522
  log.warning(f"Failed to load supporting data: {e}")
523
  log.info("Server will continue with limited functionality")
524
-
525
- log.info(f"Server initialization complete. Database: {DB_PATH}")
526
- log.info("Starting Flask development server...")
527
-
528
  port = int(os.environ.get("PORT", "7860"))
529
  app.run(host="0.0.0.0", port=port, debug=False)
 
11
  from thefuzz import process, fuzz
12
  from werkzeug.utils import secure_filename
13
  import tempfile
 
14
 
15
  # ───────────────────────────────────────────────────────────────────────────────
16
  # CONFIGURATION
 
22
  app = Flask(__name__)
23
  CORS(app)
24
 
25
+ # --- App Configuration ---
26
+ # --- ROBUST DATABASE PATH HANDLING ---
27
+ def setup_database_path():
28
+ """Setup database path with fallbacks for Hugging Face Spaces."""
29
+ # Try original path first
30
+ DB_FOLDER = 'data'
31
+ DB_PATH = os.path.join(DB_FOLDER, 'products.db')
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ try:
34
+ os.makedirs(DB_FOLDER, exist_ok=True)
35
+ # Test if we can write to this directory
36
+ test_file = os.path.join(DB_FOLDER, 'test.tmp')
37
+ with open(test_file, 'w') as f:
38
+ f.write('test')
39
+ os.remove(test_file)
40
+ log.info(f"Using database path: {DB_PATH}")
41
+ return DB_PATH
42
+ except (OSError, PermissionError):
43
+ log.warning(f"Cannot write to {DB_FOLDER}, trying fallbacks...")
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
+ # Fallback 1: Use temp directory
46
+ try:
47
+ temp_db_path = os.path.join(tempfile.gettempdir(), 'products.db')
48
+ test_file = os.path.join(tempfile.gettempdir(), 'test.tmp')
49
+ with open(test_file, 'w') as f:
50
+ f.write('test')
51
+ os.remove(test_file)
52
+ log.info(f"Using temp database path: {temp_db_path}")
53
+ return temp_db_path
54
+ except (OSError, PermissionError):
55
+ log.warning("Cannot write to temp directory, trying current directory...")
56
+
57
+ # Fallback 2: Current directory
58
+ try:
59
+ current_db_path = 'products.db'
60
+ test_file = 'test.tmp'
61
+ with open(test_file, 'w') as f:
62
+ f.write('test')
63
+ os.remove(test_file)
64
+ log.info(f"Using current directory database path: {current_db_path}")
65
+ return current_db_path
66
+ except (OSError, PermissionError):
67
+ log.warning("Cannot write to current directory, using in-memory database...")
68
+
69
+ # Final fallback: In-memory database
70
+ log.warning("Using in-memory database - data will not persist!")
71
  return ':memory:'
72
 
73
+ DB_PATH = setup_database_path()
 
74
  app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
75
  app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
76
 
77
+ # --- ROBUST UPLOAD FOLDER HANDLING ---
78
+ def setup_upload_folder():
79
+ """Setup upload folder with fallbacks."""
80
+ try:
81
+ upload_folder = 'uploads'
82
+ os.makedirs(upload_folder, exist_ok=True)
83
+ test_file = os.path.join(upload_folder, 'test.tmp')
84
+ with open(test_file, 'w') as f:
85
+ f.write('test')
86
+ os.remove(test_file)
87
+ return upload_folder
88
+ except (OSError, PermissionError):
89
+ log.warning("Cannot create uploads folder, using temp directory")
90
+ return tempfile.gettempdir()
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ app.config['UPLOAD_FOLDER'] = setup_upload_folder()
93
 
94
  # --- File Upload Configuration ---
95
  ALLOWED_EXTENSIONS = {'csv', 'xls', 'xlsx'}
96
 
97
+ # --- ROBUST DATABASE INITIALIZATION ---
98
+ db = None
99
  try:
100
  db = SQLAlchemy(app)
101
  log.info("SQLAlchemy initialized successfully")
102
  except Exception as e:
103
  log.error(f"Failed to initialize SQLAlchemy: {e}")
104
+ # Try in-memory fallback
105
+ app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
106
+ db = SQLAlchemy(app)
107
+ log.warning("Fell back to in-memory database")
108
 
109
  # ───────────────────────────────────────────────────────────────────────────────
110
  # DATABASE MODEL (Based on products-20.sql)
 
132
  def __repr__(self):
133
  return f'<Product {self.id}: {self.name}>'
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  # ───────────────────────────────────────────────────────────────────────────────
136
  # DATA LOADING & PRE-PROCESSING
137
  # ───────────────────────────────────────────────────────────────────────────────
 
142
  HS_CODE_DESCRIPTIONS = {}
143
 
144
  def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
145
+ log.info(f"Parsing HS Codes from '{filepath}'...")
 
146
  if not os.path.exists(filepath):
147
  log.warning(f"HS Code PDF not found at '{filepath}'. Categorization will be limited.")
148
  return []
 
149
  codes = []
150
  try:
151
  with pdfplumber.open(filepath) as pdf:
152
+ for page in pdf.pages:
153
+ text = page.extract_text()
154
+ if not text:
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  continue
156
+ # Improved regex to handle variations in PDF formatting
157
+ matches = re.findall(r'\"(\d{8})\"\s*,\s*\"(.*?)\"', text, re.DOTALL)
158
+ for code, desc in matches:
159
+ clean_desc = desc.replace('\n', ' ').strip()
160
+ if code and clean_desc:
161
+ codes.append({'code': code, 'description': clean_desc})
162
+ HS_CODE_DESCRIPTIONS[clean_desc] = code
163
  except Exception as e:
164
+ log.error(f"Failed to parse PDF: {e}")
165
+ log.info(f"Successfully parsed {len(codes)} HS codes.")
 
166
  return codes
167
 
168
  def load_existing_products(filepath='Product List.csv'):
169
+ log.info(f"Loading master product list from '{filepath}'...")
 
170
  if not os.path.exists(filepath):
171
+ log.warning(f"Master product list not found at '{filepath}'. Validation may be limited.")
172
  return []
 
173
  try:
174
+ # Try multiple encodings
175
+ for encoding in ['utf-8', 'latin-1', 'cp1252']:
 
 
 
176
  try:
177
+ # Based on the CSV structure, the 'name' is in the second column.
178
+ df = pd.read_csv(filepath, usecols=[1], names=['name'], header=0, encoding=encoding)
179
+ product_names = df['name'].dropna().astype(str).unique().tolist()
180
+ log.info(f"Loaded {len(product_names)} unique existing products with {encoding} encoding.")
181
+ return product_names
182
  except UnicodeDecodeError:
183
  continue
184
+ except Exception as e:
185
+ log.warning(f"Error with {encoding} encoding: {e}")
186
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  except Exception as e:
188
  log.error(f"Failed to load master product list: {e}")
189
+ return []
190
 
191
  # ───────────────────────────────────────────────────────────────────────────────
192
  # CORE PROCESSING PIPELINE
 
199
  "processed": 0, "added": 0, "updated": 0, "skipped_duplicates": 0,
200
  "errors": [], "processed_data": []
201
  }
202
+ df = None
203
+
204
  try:
 
 
205
  file_ext = filename.rsplit('.', 1)[1].lower() if '.' in filename else ''
206
 
207
+ # Try multiple encodings for CSV files
208
+ if file_ext == 'csv':
209
+ for encoding in ['utf-8', 'latin-1', 'cp1252']:
210
+ try:
211
+ df = pd.read_csv(filepath, header=None, usecols=[1], names=['product_name'], encoding=encoding)
212
+ log.info(f"Successfully read CSV with {encoding} encoding")
 
 
 
 
 
 
213
  break
214
+ except (UnicodeDecodeError, ValueError):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  continue
216
+ except Exception as e:
217
+ log.warning(f"Error reading CSV with {encoding}: {e}")
218
+ continue
219
+ elif file_ext in ['xls', 'xlsx']:
220
+ try:
221
+ df = pd.read_excel(filepath, header=None, usecols=[1], names=['product_name'], engine='openpyxl')
222
+ except Exception as e:
223
+ log.error(f"Error reading Excel file: {e}")
224
 
225
+ if df is None:
226
+ results['errors'].append("Could not read the uploaded file with any supported encoding.")
227
+ return results
228
+
229
+ except Exception as e:
230
+ log.error(f"Could not read the uploaded file: {e}")
231
+ results['errors'].append(f"Invalid file format or corrupt file: {e}")
232
+ return results
233
+
234
+ if df.empty:
235
+ results['errors'].append("The uploaded file is empty.")
236
+ return results
237
+
238
+ for index, row in df.iterrows():
239
+ try:
240
+ raw_name = row['product_name']
241
+ results['processed'] += 1
242
+
243
+ if pd.isna(raw_name) or not str(raw_name).strip():
244
+ continue
245
+
246
+ cleaned_name = str(raw_name).strip()
247
+
248
+ # Fuzzy matching with error handling
249
+ best_match, score = (cleaned_name, 100)
250
+ if EXISTING_PRODUCT_NAMES:
 
 
 
 
 
 
 
 
 
251
  try:
252
+ best_match, score = process.extractOne(
253
+ cleaned_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
254
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  except Exception as e:
256
+ log.warning(f"Fuzzy matching failed for '{cleaned_name}': {e}")
257
+
258
+ validated_name = best_match if score >= FUZZY_MATCH_THRESHOLD else cleaned_name
259
+
260
+ # HS Code matching with error handling
261
+ best_hs_desc, hs_code = None, None
262
+ if HS_CODE_DESCRIPTIONS:
263
+ try:
264
+ best_hs_desc, _ = process.extractOne(
265
+ validated_name, list(HS_CODE_DESCRIPTIONS.keys())
266
+ )
267
+ hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
268
+ except Exception as e:
269
+ log.warning(f"HS code matching failed for '{validated_name}': {e}")
270
+
271
+ processed_entry = {
272
+ "raw_name": str(raw_name), "cleaned_name": validated_name, "hs_code": hs_code,
273
+ "primary_category": best_hs_desc or "N/A", "status": ""
274
+ }
275
+
276
+ try:
277
+ existing_product = Product.query.filter_by(name=validated_name).first()
278
+ if existing_product:
279
+ if hs_code and existing_product.hs_code != hs_code:
280
+ existing_product.hs_code = hs_code
281
+ existing_product.primary_category = best_hs_desc or "N/A"
282
+ db.session.commit()
283
+ results['updated'] += 1
284
+ processed_entry['status'] = 'Updated'
285
+ else:
286
+ results['skipped_duplicates'] += 1
287
+ processed_entry['status'] = 'Skipped (Duplicate)'
288
+ else:
289
+ new_product = Product(name=validated_name, hs_code=hs_code, primary_category=best_hs_desc or 'N/A')
290
+ db.session.add(new_product)
291
+ db.session.commit()
292
+ results['added'] += 1
293
+ processed_entry['status'] = 'Added'
294
+ results['processed_data'].append(processed_entry)
295
  except Exception as e:
296
+ db.session.rollback()
297
+ log.error(f"Database error for '{validated_name}': {e}")
298
+ results['errors'].append(f"DB Error on '{validated_name}': {e}")
299
+ processed_entry['status'] = 'Error'
300
+ results['processed_data'].append(processed_entry)
301
+ except Exception as e:
302
+ log.error(f"Error processing row {index}: {e}")
303
+ results['errors'].append(f"Row {index + 1} error: {str(e)}")
304
+ continue
305
+
306
  return results
307
 
308
  # ───────────────────────────────────────────────────────────────────────────────
 
317
  return jsonify({
318
  "ok": True,
319
  "message": "The Product Validation server is running.",
320
+ "database": "in-memory" if DB_PATH == ':memory:' else "persistent"
 
321
  })
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  @app.post("/api/upload")
324
  def upload_products():
325
  if 'file' not in request.files:
326
  return jsonify({"ok": False, "error": "No file part in the request"}), 400
 
327
  file = request.files['file']
328
  if file.filename == '':
329
  return jsonify({"ok": False, "error": "No file selected"}), 400
 
333
  filename = secure_filename(file.filename)
334
  filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
335
  file.save(filepath)
 
336
  results = process_uploaded_file(filepath, filename)
337
 
338
  # Clean up uploaded file
339
  try:
340
  os.remove(filepath)
341
  except:
342
+ pass # Non-critical cleanup
343
 
344
+ return jsonify({"ok": True, "message": "File processed successfully", "results": results})
 
 
 
 
 
345
  except Exception as e:
346
+ log.error(f"Upload error: {e}")
347
+ return jsonify({"ok": False, "error": f"Upload failed: {str(e)}"}, 500)
 
 
 
348
 
349
+ return jsonify({"ok": False, "error": f"Invalid file type. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400
 
 
 
350
 
351
  @app.get("/api/products")
352
  def get_products():
 
355
  all_products = Product.query.all()
356
  products_list = [product.to_dict() for product in all_products]
357
  log.info(f"Successfully retrieved {len(products_list)} products.")
358
+ return jsonify({"ok": True, "count": len(products_list), "products": products_list})
 
 
 
 
359
  except Exception as e:
360
  log.error(f"Could not retrieve products from database: {e}")
361
+ return jsonify({"ok": False, "error": f"Database error: {str(e)}"}, 500)
 
 
 
362
 
363
  # ───────────────────────────────────────────────────────────────────────────────
364
  # MAIN (Server Initialization)
365
  # ───────────────────────────────────────────────────────────────────────────────
366
 
367
  if __name__ == "__main__":
368
+ log.info("===== Application Startup =====")
 
 
 
 
369
 
370
+ # Robust database initialization
371
+ max_retries = 3
372
+ for attempt in range(max_retries):
373
+ try:
374
+ with app.app_context():
375
+ log.info(f"Initializing server (attempt {attempt + 1})...")
376
+ db.create_all()
377
+ log.info("Database tables created successfully")
378
+ break
379
+ except Exception as e:
380
+ log.error(f"Database initialization failed (attempt {attempt + 1}): {e}")
381
+ if attempt == max_retries - 1:
382
+ log.error("All database initialization attempts failed, trying in-memory fallback")
383
+ try:
384
+ app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
385
+ # Reinitialize the database connection
386
+ from flask_sqlalchemy import SQLAlchemy
387
+ db.init_app(app)
388
+ with app.app_context():
389
+ db.create_all()
390
+ log.warning("Using in-memory database - data will not persist")
391
+ break
392
+ except Exception as fallback_error:
393
+ log.error(f"Even in-memory database failed: {fallback_error}")
394
+ raise
395
+ else:
396
+ import time
397
+ time.sleep(1) # Brief pause between retries
398
+
399
  # Load supporting data (non-critical)
400
  try:
 
401
  HS_CODES_DATA = parse_hs_codes_pdf()
402
  EXISTING_PRODUCT_NAMES = load_existing_products()
403
  log.info("Supporting data loaded successfully")
404
  except Exception as e:
405
  log.warning(f"Failed to load supporting data: {e}")
406
  log.info("Server will continue with limited functionality")
407
+
408
+ log.info(f"Server is ready. Database: {DB_PATH}")
409
+
 
410
  port = int(os.environ.get("PORT", "7860"))
411
  app.run(host="0.0.0.0", port=port, debug=False)