rairo committed on
Commit
b6ed410
·
verified ·
1 Parent(s): daddd99

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +72 -113
main.py CHANGED
@@ -21,14 +21,20 @@ log = logging.getLogger("product-pipeline-api")
21
  app = Flask(__name__)
22
  CORS(app)
23
 
24
- # --- Database Configuration (Mocking MySQL with SQLite) ---
25
- # Use an in-memory SQLite database for simplicity and portability.
26
- # This mimics the real database without requiring a MySQL server for development.
27
- app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
 
 
 
28
  app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
29
  app.config['UPLOAD_FOLDER'] = 'uploads'
30
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
31
 
 
 
 
32
  db = SQLAlchemy(app)
33
 
34
  # ───────────────────────────────────────────────────────────────────────────────
@@ -61,29 +67,23 @@ class Product(db.Model):
61
  # DATA LOADING & PRE-PROCESSING
62
  # ───────────────────────────────────────────────────────────────────────────────
63
 
64
- # --- Constants for Validation Logic ---
65
- FUZZY_MATCH_THRESHOLD = 85 # Similarity score (out of 100) to consider a match
66
-
67
- # --- In-memory cache for essential data ---
68
  HS_CODES_DATA = []
69
  EXISTING_PRODUCT_NAMES = []
70
  HS_CODE_DESCRIPTIONS = {}
71
 
72
  def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
73
- """Extracts HS codes and descriptions from the PDF file."""
74
  log.info(f"Parsing HS Codes from '{filepath}'...")
75
  if not os.path.exists(filepath):
76
  log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
77
  return []
78
-
79
  codes = []
80
  try:
81
  with pdfplumber.open(filepath) as pdf:
82
  for page in pdf.pages:
83
  text = page.extract_text()
84
- # Regex to find HS codes and their descriptions.
85
- # It looks for a pattern of numbers (code) followed by text (description).
86
- matches = re.findall(r'\"(\d+)\n\"\,?\"(.*?)\n\"', text, re.DOTALL)
87
  for code, desc in matches:
88
  clean_desc = desc.replace('\n', ' ').strip()
89
  if code and clean_desc:
@@ -95,15 +95,13 @@ def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
95
  return codes
96
 
97
  def load_existing_products(filepath='Product List.csv'):
98
- """Loads the master product list for validation."""
99
  log.info(f"Loading master product list from '{filepath}'...")
100
  if not os.path.exists(filepath):
101
  log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
102
  return []
103
-
104
  try:
105
- df = pd.read_csv(filepath)
106
- # Drop duplicates to ensure a clean list for matching
107
  product_names = df['name'].dropna().unique().tolist()
108
  log.info(f"Loaded {len(product_names)} unique existing products.")
109
  return product_names
@@ -115,153 +113,118 @@ def load_existing_products(filepath='Product List.csv'):
115
  # CORE PROCESSING PIPELINE
116
  # ───────────────────────────────────────────────────────────────────────────────
117
 
118
- def process_uploaded_file(filepath):
119
- """
120
- The main pipeline to validate, clean, categorize, and store product data.
121
- """
122
  log.info(f"Starting processing for file: {filepath}")
123
  results = {
124
- "processed": 0,
125
- "added": 0,
126
- "updated": 0,
127
- "skipped_duplicates": 0,
128
- "errors": [],
129
- "processed_data": []
130
  }
 
131
 
132
  try:
133
- # The uploaded file might not have headers, so we name the column
134
- df = pd.read_csv(filepath, header=None, names=['product_name_raw'])
 
 
 
 
 
 
 
 
135
  except Exception as e:
136
- log.error(f"Could not read CSV: {e}")
137
- results['errors'].append(f"Invalid CSV format: {e}")
138
  return results
139
 
140
- # Extract the column with product names, even if its index is not 0
141
- product_name_col = None
142
- for col in df.columns:
143
- # Heuristic: find the column that seems to contain product names (string type)
144
- if df[col].dtype == 'object' and df[col].astype(str).str.contains('[a-zA-Z]').any():
145
- product_name_col = col
146
- break
147
-
148
- if product_name_col is None:
149
- results['errors'].append("Could not find a column with product names in the uploaded CSV.")
150
  return results
151
 
152
  for index, row in df.iterrows():
153
- raw_name = row[product_name_col]
154
  results['processed'] += 1
155
 
156
  if not isinstance(raw_name, str) or not raw_name.strip():
157
- continue # Skip empty rows
158
 
159
- # --- 1. Validation & Cleaning using Fuzzy Matching ---
160
- # Find the closest match from the master product list
161
- best_match, score = process.extractOne(
162
- raw_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
163
- ) if EXISTING_PRODUCT_NAMES else (raw_name, 100)
164
 
165
- cleaned_name = best_match if score >= FUZZY_MATCH_THRESHOLD else raw_name
166
- log.info(f"'{raw_name}' -> '{cleaned_name}' (Score: {score})")
 
 
167
 
168
- # --- 2. HS Code Categorization ---
169
- # Find the best HS code description match for the cleaned name
170
  best_hs_desc, _ = process.extractOne(
171
- cleaned_name, HS_CODE_DESCRIPTIONS.keys()
172
  ) if HS_CODE_DESCRIPTIONS else (None, 0)
173
-
174
  hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
175
- log.info(f"Assigned HS Code: {hs_code} (Based on: '{best_hs_desc}')")
176
 
177
- # --- 3. Database Operation ---
178
  processed_entry = {
179
- "raw_name": raw_name,
180
- "cleaned_name": cleaned_name,
181
- "hs_code": hs_code,
182
- "primary_category": best_hs_desc or "N/A",
183
- "status": ""
184
  }
185
-
186
  try:
187
- # Check if a product with this cleaned name already exists
188
- existing_product = Product.query.filter_by(name=cleaned_name).first()
189
-
190
- if existing_product:
191
- # Update existing product if HS code is new
192
- if hs_code and existing_product.hs_code != hs_code:
193
- existing_product.hs_code = hs_code
194
- existing_product.primary_category = best_hs_desc
195
- db.session.commit()
196
- results['updated'] += 1
197
- processed_entry['status'] = 'Updated'
 
 
198
  else:
199
- results['skipped_duplicates'] += 1
200
- processed_entry['status'] = 'Skipped (Duplicate)'
201
- else:
202
- # Add new product
203
- new_product = Product(
204
- name=cleaned_name,
205
- hs_code=hs_code,
206
- primary_category=best_hs_desc or 'N/A'
207
- )
208
- db.session.add(new_product)
209
- db.session.commit()
210
- results['added'] += 1
211
- processed_entry['status'] = 'Added'
212
-
213
- results['processed_data'].append(processed_entry)
214
-
215
- except IntegrityError:
216
- db.session.rollback()
217
- log.warning(f"Integrity error for '{cleaned_name}', likely a race condition. Skipping.")
218
- results['skipped_duplicates'] += 1
219
  except Exception as e:
220
  db.session.rollback()
221
- log.error(f"Database error for '{cleaned_name}': {e}")
222
- results['errors'].append(f"DB Error on '{cleaned_name}': {e}")
223
-
224
  return results
225
 
226
-
227
  # ───────────────────────────────────────────────────────────────────────────────
228
  # ROUTES
229
  # ───────────────────────────────────────────────────────────────────────────────
230
 
 
 
 
231
  @app.get("/")
232
  def root():
233
  return jsonify({"ok": True, "message": "The Product Validation server is running."})
234
 
235
-
236
  @app.post("/api/upload")
237
  def upload_products():
238
- """Endpoint to upload and process a product CSV file."""
239
  if 'file' not in request.files:
240
  return jsonify({"ok": False, "error": "No file part in the request"}), 400
241
-
242
  file = request.files['file']
243
  if file.filename == '':
244
  return jsonify({"ok": False, "error": "No file selected"}), 400
245
 
246
- if file and file.filename.endswith('.csv'):
247
  filename = secure_filename(file.filename)
248
  filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
249
  file.save(filepath)
250
-
251
- results = process_uploaded_file(filepath)
252
-
253
  return jsonify({"ok": True, "message": "File processed successfully", "results": results})
254
 
255
- return jsonify({"ok": False, "error": "Invalid file type. Please upload a CSV."}), 400
256
-
257
 
258
  @app.get("/api/products")
259
  def get_products():
260
- """Endpoint to retrieve all processed products from the database."""
261
  log.info("Request received to fetch all products.")
262
  try:
263
  all_products = Product.query.all()
264
- # Use the to_dict() method to serialize each product object
265
  products_list = [product.to_dict() for product in all_products]
266
  log.info(f"Successfully retrieved {len(products_list)} products.")
267
  return jsonify({"ok": True, "count": len(products_list), "products": products_list})
@@ -269,7 +232,6 @@ def get_products():
269
  log.error(f"Could not retrieve products from database: {e}")
270
  return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500
271
 
272
-
273
  # ───────────────────────────────────────────────────────────────────────────────
274
  # MAIN (Server Initialization)
275
  # ───────────────────────────────────────────────────────────────────────────────
@@ -277,13 +239,10 @@ def get_products():
277
  if __name__ == "__main__":
278
  with app.app_context():
279
  log.info("Initializing server...")
280
- # Create database tables based on the model
281
  db.create_all()
282
-
283
- # Load validation data into memory
284
  HS_CODES_DATA = parse_hs_codes_pdf()
285
  EXISTING_PRODUCT_NAMES = load_existing_products()
286
- log.info("Server is ready and validation data is loaded.")
287
 
288
  port = int(os.environ.get("PORT", "7860"))
289
  app.run(host="0.0.0.0", port=port, debug=False)
 
app = Flask(__name__)
CORS(app)

# --- App Configuration ---
# Persistent, file-based SQLite database so that stored products survive
# between requests (the previous in-memory database did not).
DB_FOLDER = 'data'
DB_PATH = os.path.join(DB_FOLDER, 'products.db')
os.makedirs(DB_FOLDER, exist_ok=True)
# Build the URI from an absolute path.  Flask-SQLAlchemy 3.x resolves a
# relative SQLite path against the app's instance folder rather than the
# working directory, which would point at a directory other than the one
# created just above and make the database impossible to open.
app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{os.path.abspath(DB_PATH)}'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = 'uploads'
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# --- File Upload Configuration ---
# Lower-case extensions (without the dot) accepted by the upload endpoint.
ALLOWED_EXTENSIONS = {'csv', 'xls', 'xlsx'}

db = SQLAlchemy(app)
39
 
40
  # ───────────────────────────────────────────────────────────────────────────────
 
67
  # DATA LOADING & PRE-PROCESSING
68
  # ───────────────────────────────────────────────────────────────────────────────
69
 
70
+ FUZZY_MATCH_THRESHOLD = 85
 
 
 
71
  HS_CODES_DATA = []
72
  EXISTING_PRODUCT_NAMES = []
73
  HS_CODE_DESCRIPTIONS = {}
74
 
75
  def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
 
76
  log.info(f"Parsing HS Codes from '{filepath}'...")
77
  if not os.path.exists(filepath):
78
  log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
79
  return []
 
80
  codes = []
81
  try:
82
  with pdfplumber.open(filepath) as pdf:
83
  for page in pdf.pages:
84
  text = page.extract_text()
85
+ # Improved regex to handle variations in PDF formatting
86
+ matches = re.findall(r'\"(\d{8})\"\s*,\s*\"(.*?)\"', text, re.DOTALL)
 
87
  for code, desc in matches:
88
  clean_desc = desc.replace('\n', ' ').strip()
89
  if code and clean_desc:
 
95
  return codes
96
 
97
  def load_existing_products(filepath='Product List.csv'):
 
98
  log.info(f"Loading master product list from '{filepath}'...")
99
  if not os.path.exists(filepath):
100
  log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
101
  return []
 
102
  try:
103
+ # Based on the CSV structure, the 'name' is in the second column.
104
+ df = pd.read_csv(filepath, usecols=[1], names=['name'], header=0)
105
  product_names = df['name'].dropna().unique().tolist()
106
  log.info(f"Loaded {len(product_names)} unique existing products.")
107
  return product_names
 
113
  # CORE PROCESSING PIPELINE
114
  # ───────────────────────────────────────────────────────────────────────────────
115
 
def process_uploaded_file(filepath, filename):
    """Validate, clean, categorize, and store the products in an uploaded file.

    Product names are read from the second column (index 1) of the file,
    which is parsed without a header row.  Each non-blank name is
    fuzzy-matched against ``EXISTING_PRODUCT_NAMES``; a match scoring at
    least ``FUZZY_MATCH_THRESHOLD`` replaces the uploaded spelling.  The
    resulting name is matched against the keys of ``HS_CODE_DESCRIPTIONS``
    to pick an HS code, and the product is then added to, or updated in,
    the ``Product`` table.

    Args:
        filepath: Path of the saved upload on disk.
        filename: Name of the upload; its extension (``csv``, ``xls`` or
            ``xlsx``) selects the parser.

    Returns:
        A dict with the counters ``processed`` (every row read, including
        blank ones), ``added``, ``updated`` and ``skipped_duplicates``, a
        list of ``errors`` messages, and ``processed_data`` with one entry
        per row whose database step succeeded.  Failures are reported
        through ``errors`` rather than raised.
    """
    log.info(f"Starting processing for file: {filepath}")
    results = {
        "processed": 0, "added": 0, "updated": 0, "skipped_duplicates": 0,
        "errors": [], "processed_data": []
    }

    # Choose the parser up front so that a missing or unsupported extension
    # is reported as an error instead of leaving the frame unset.
    _, dot, extension = filename.rpartition('.')
    file_ext = extension.lower() if dot else ''
    if file_ext == 'csv':
        read_table = pd.read_csv
    elif file_ext in ('xls', 'xlsx'):
        # No explicit engine: pandas picks one from the file type.  Forcing
        # openpyxl would break legacy .xls workbooks, which it cannot read.
        read_table = pd.read_excel
    else:
        log.error(f"Unsupported file type for upload: {filename}")
        results['errors'].append(
            f"Unsupported file type: '{filename}'. Expected a .csv, .xls or .xlsx file."
        )
        return results

    try:
        # The product name is expected in the second column (index 1).
        df = read_table(filepath, header=None, usecols=[1], names=['product_name'])
    except pd.errors.EmptyDataError:
        # Subclass of ValueError: handle it first so an empty file is not
        # misreported as a missing column.
        results['errors'].append("The uploaded file is empty.")
        return results
    except ValueError:
        results['errors'].append("Could not find the product name column. Ensure the product name is in the second column.")
        return results
    except Exception as e:
        log.error(f"Could not read the uploaded file: {e}")
        results['errors'].append(f"Invalid file format or corrupt file: {e}")
        return results

    if df.empty:
        results['errors'].append("The uploaded file is empty.")
        return results

    # Loop-invariant.  A list (not the dict itself) is passed to the matcher
    # so that names are compared against the descriptions, i.e. the keys.
    hs_descriptions = list(HS_CODE_DESCRIPTIONS)

    # One application context for the whole batch; the rollback below runs
    # inside it, so it acts on the same session as the failed operation.
    with app.app_context():
        for raw_name in df['product_name']:
            results['processed'] += 1

            # Skip blank cells and non-text values (e.g. NaN or numbers).
            if not isinstance(raw_name, str) or not raw_name.strip():
                continue

            cleaned_name = raw_name.strip()

            # 1. Validation: snap the name to the master list when the
            #    match is close enough, otherwise keep the uploaded name.
            validated_name = cleaned_name
            if EXISTING_PRODUCT_NAMES:
                best_match, score = process.extractOne(
                    cleaned_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
                )
                if score >= FUZZY_MATCH_THRESHOLD:
                    validated_name = best_match

            # 2. Categorization: closest HS description, if any are loaded.
            best_hs_desc = None
            if hs_descriptions:
                best_hs_desc, _ = process.extractOne(validated_name, hs_descriptions)
            hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)

            processed_entry = {
                "raw_name": raw_name, "cleaned_name": validated_name, "hs_code": hs_code,
                "primary_category": best_hs_desc or "N/A", "status": ""
            }

            # 3. Database: insert new products, refresh the HS code of known
            #    ones when it changed, otherwise count the row as a duplicate.
            try:
                existing_product = Product.query.filter_by(name=validated_name).first()
                if existing_product is None:
                    db.session.add(Product(
                        name=validated_name,
                        hs_code=hs_code,
                        primary_category=best_hs_desc or 'N/A',
                    ))
                    db.session.commit()
                    results['added'] += 1
                    processed_entry['status'] = 'Added'
                elif hs_code and existing_product.hs_code != hs_code:
                    existing_product.hs_code = hs_code
                    existing_product.primary_category = best_hs_desc
                    db.session.commit()
                    results['updated'] += 1
                    processed_entry['status'] = 'Updated'
                else:
                    results['skipped_duplicates'] += 1
                    processed_entry['status'] = 'Skipped (Duplicate)'
                results['processed_data'].append(processed_entry)
            except Exception as e:
                # Keep going with the remaining rows; the failure is
                # recorded in the result instead of aborting the upload.
                db.session.rollback()
                log.error(f"Database error for '{validated_name}': {e}")
                results['errors'].append(f"DB Error on '{validated_name}': {e}")

    return results
194
 
 
195
  # ───────────────────────────────────────────────────────────────────────────────
196
  # ROUTES
197
  # ───────────────────────────────────────────────────────────────────────────────
198
 
def allowed_file(filename):
    """Return True when *filename* ends in one of ``ALLOWED_EXTENSIONS``.

    The extension is the text after the last dot, compared
    case-insensitively; a name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
201
+
@app.get("/")
def root():
    """Liveness endpoint: report that the server is up."""
    payload = {"ok": True, "message": "The Product Validation server is running."}
    return jsonify(payload)
205
 
 
@app.post("/api/upload")
def upload_products():
    """Accept a product file upload, store it, and run the processing pipeline.

    Expects a multipart form field named ``file`` whose extension is one of
    ``ALLOWED_EXTENSIONS``.

    Returns:
        A JSON response.  On success it carries the dict returned by
        ``process_uploaded_file`` under ``results``; a missing file, an
        empty filename or a disallowed extension yields HTTP 400 with an
        ``error`` message.
    """
    if 'file' not in request.files:
        return jsonify({"ok": False, "error": "No file part in the request"}), 400

    file = request.files['file']
    if not file.filename:
        return jsonify({"ok": False, "error": "No file selected"}), 400

    if not allowed_file(file.filename):
        # Sorted so the message is stable; set iteration order is not.
        allowed_types = ', '.join(sorted(ALLOWED_EXTENSIONS))
        return jsonify({"ok": False, "error": f"Invalid file type. Allowed types are: {allowed_types}"}), 400

    filename = secure_filename(file.filename)
    if not allowed_file(filename):
        # Sanitising can strip the stem and the dot (for example a name made
        # only of non-ASCII characters collapses to just "csv").  Fall back
        # to a generic stem plus the already-validated extension so the
        # pipeline can still tell which parser to use.
        extension = file.filename.rsplit('.', 1)[1].lower()
        filename = f"upload.{extension}"

    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)
    results = process_uploaded_file(filepath, filename)
    return jsonify({"ok": True, "message": "File processed successfully", "results": results})
 
222
 
223
  @app.get("/api/products")
224
  def get_products():
 
225
  log.info("Request received to fetch all products.")
226
  try:
227
  all_products = Product.query.all()
 
228
  products_list = [product.to_dict() for product in all_products]
229
  log.info(f"Successfully retrieved {len(products_list)} products.")
230
  return jsonify({"ok": True, "count": len(products_list), "products": products_list})
 
232
  log.error(f"Could not retrieve products from database: {e}")
233
  return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500
234
 
 
235
  # ───────────────────────────────────────────────────────────────────────────────
236
  # MAIN (Server Initialization)
237
  # ───────────────────────────────────────────────────────────────────────────────
 
if __name__ == "__main__":
    # One-time start-up work that needs the application context: create the
    # tables and fill the module-level caches from the reference files.
    with app.app_context():
        log.info("Initializing server...")
        db.create_all()
        HS_CODES_DATA = parse_hs_codes_pdf()
        EXISTING_PRODUCT_NAMES = load_existing_products()
        log.info(f"Server is ready. Database is at: {DB_PATH}")

    # NOTE(review): indentation was lost in this view; the server start is
    # assumed to sit outside the app-context block above - confirm.
    # The listening port comes from the PORT environment variable and
    # falls back to 7860 when it is not set.
    port = int(os.getenv("PORT", "7860"))
    app.run(host="0.0.0.0", port=port, debug=False)