rairo committed on
Commit
d1f57b9
·
verified ·
1 Parent(s): 35e9aaf

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +290 -0
main.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import logging
4
+ import re
5
+ import pandas as pd
6
+ import pdfplumber
7
+ from flask import Flask, request, jsonify
8
+ from flask_cors import CORS
9
+ from flask_sqlalchemy import SQLAlchemy
10
+ from sqlalchemy.exc import IntegrityError
11
+ from thefuzz import process, fuzz
12
+ from werkzeug.utils import secure_filename
13
+
14
# ───────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ───────────────────────────────────────────────────────────────────────────────

# Root logging config plus a named logger for this service.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("product-pipeline-api")

# Flask app with CORS enabled for all routes (no origin restriction).
app = Flask(__name__)
CORS(app)

# --- Database Configuration (Mocking MySQL with SQLite) ---
# Use an in-memory SQLite database for simplicity and portability.
# This mimics the real database without requiring a MySQL server for development.
# NOTE(review): an in-memory SQLite DB lives per-connection/per-process — data
# is lost on restart and is not shared across workers; confirm acceptable.
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = 'uploads'
# Ensure the upload directory exists before any request saves a file into it.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

db = SQLAlchemy(app)
33
+
34
+ # ───────────────────────────────────────────────────────────────────────────────
35
+ # DATABASE MODEL (Based on products-20.sql)
36
+ # ───────────────────────────────────────────────────────────────────────────────
37
+
38
class Product(db.Model):
    """ORM model for the 'products' table (schema based on products-20.sql)."""

    __tablename__ = 'products'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255), nullable=False, unique=True)
    category_id = db.Column(db.Integer, nullable=False, default=1)
    primary_category = db.Column(db.String(255), nullable=False, default='N/A')
    hs_code = db.Column(db.String(255), nullable=True)

    def to_dict(self):
        """Return a JSON-serializable dict of this product's columns."""
        fields = ('id', 'name', 'category_id', 'primary_category', 'hs_code')
        return {field: getattr(self, field) for field in fields}

    def __repr__(self):
        return f'<Product {self.id}: {self.name}>'
59
+
60
# ───────────────────────────────────────────────────────────────────────────────
# DATA LOADING & PRE-PROCESSING
# ───────────────────────────────────────────────────────────────────────────────

# --- Constants for Validation Logic ---
FUZZY_MATCH_THRESHOLD = 85 # Similarity score (out of 100) to consider a match

# --- In-memory cache for essential data ---
# Populated at startup (in the __main__ block) by parse_hs_codes_pdf() and
# load_existing_products(); read by process_uploaded_file().
HS_CODES_DATA = []            # list of {'code': ..., 'description': ...} dicts
EXISTING_PRODUCT_NAMES = []   # master product names used for fuzzy validation
HS_CODE_DESCRIPTIONS = {}     # maps HS description -> HS code for categorization
71
+
72
def parse_hs_codes_pdf(filepath='HS Codes for use under FDMS.pdf'):
    """Extract HS codes and descriptions from the PDF file.

    Side effect: fills the module-level HS_CODE_DESCRIPTIONS cache
    (description -> code) used later for categorization.

    Returns a list of {'code': ..., 'description': ...} dicts; empty when
    the PDF is missing or parsing fails.
    """
    log.info(f"Parsing HS Codes from '{filepath}'...")
    if not os.path.exists(filepath):
        log.error(f"HS Code PDF not found at '{filepath}'. Categorization will fail.")
        return []

    # Regex to find HS codes and their descriptions: a quoted numeric code
    # followed by a quoted description. Compiled once, reused per page.
    pattern = re.compile(r'\"(\d+)\n\"\,?\"(.*?)\n\"', re.DOTALL)

    codes = []
    try:
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # extract_text() can return None for pages with no text layer;
                # previously that raised a TypeError and aborted the whole parse.
                if not text:
                    continue
                for code, desc in pattern.findall(text):
                    clean_desc = desc.replace('\n', ' ').strip()
                    if code and clean_desc:
                        codes.append({'code': code, 'description': clean_desc})
                        HS_CODE_DESCRIPTIONS[clean_desc] = code
    except Exception as e:
        log.error(f"Failed to parse PDF: {e}")
    log.info(f"Successfully parsed {len(codes)} HS codes.")
    return codes
96
+
97
def load_existing_products(filepath='Product List.csv'):
    """Read the master product list used for fuzzy-match validation.

    Returns a list of unique, non-null product names; empty when the file
    is missing or unreadable.
    """
    log.info(f"Loading master product list from '{filepath}'...")
    if not os.path.exists(filepath):
        log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
        return []

    try:
        master = pd.read_csv(filepath)
        # De-duplicate so the fuzzy matcher works against a clean list.
        names = master['name'].dropna().unique().tolist()
    except Exception as e:
        log.error(f"Failed to load master product list: {e}")
        return []
    log.info(f"Loaded {len(names)} unique existing products.")
    return names
113
+
114
+ # ───────────────────────────────────────────────────────────────────────────────
115
+ # CORE PROCESSING PIPELINE
116
+ # ───────────────────────────────────────────────────────────────────────────────
117
+
118
def process_uploaded_file(filepath):
    """
    The main pipeline to validate, clean, categorize, and store product data.

    Per row:
      1. Fuzzy-match the raw name against EXISTING_PRODUCT_NAMES and adopt the
         best match when its score >= FUZZY_MATCH_THRESHOLD.
      2. Fuzzy-match the cleaned name against HS_CODE_DESCRIPTIONS keys to
         assign an HS code.
      3. Insert the product, or update an existing row's HS code.

    Returns a summary dict with counts, per-row results, and error messages.
    """
    log.info(f"Starting processing for file: {filepath}")
    results = {
        "processed": 0,
        "added": 0,
        "updated": 0,
        "skipped_duplicates": 0,
        "errors": [],
        "processed_data": []
    }

    try:
        # Read without headers and without forcing column names: passing a
        # one-element `names=` list made pandas push any extra columns of a
        # multi-column CSV into the index, defeating the column-detection
        # heuristic below.
        df = pd.read_csv(filepath, header=None)
    except Exception as e:
        log.error(f"Could not read CSV: {e}")
        results['errors'].append(f"Invalid CSV format: {e}")
        return results

    # Extract the column with product names, even if its index is not 0
    product_name_col = None
    for col in df.columns:
        # Heuristic: first object-dtype column containing alphabetic text.
        if df[col].dtype == 'object' and df[col].astype(str).str.contains('[a-zA-Z]').any():
            product_name_col = col
            break

    if product_name_col is None:
        results['errors'].append("Could not find a column with product names in the uploaded CSV.")
        return results

    for index, row in df.iterrows():
        raw_name = row[product_name_col]
        results['processed'] += 1

        if not isinstance(raw_name, str) or not raw_name.strip():
            continue  # Skip empty rows

        # --- 1. Validation & Cleaning using Fuzzy Matching ---
        # Find the closest match from the master product list; with no master
        # list, treat the raw name as a perfect match.
        best_match, score = process.extractOne(
            raw_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
        ) if EXISTING_PRODUCT_NAMES else (raw_name, 100)

        cleaned_name = best_match if score >= FUZZY_MATCH_THRESHOLD else raw_name
        log.info(f"'{raw_name}' -> '{cleaned_name}' (Score: {score})")

        # --- 2. HS Code Categorization ---
        # Find the best HS code description match for the cleaned name
        best_hs_desc, _ = process.extractOne(
            cleaned_name, HS_CODE_DESCRIPTIONS.keys()
        ) if HS_CODE_DESCRIPTIONS else (None, 0)

        hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)
        log.info(f"Assigned HS Code: {hs_code} (Based on: '{best_hs_desc}')")

        # --- 3. Database Operation ---
        processed_entry = {
            "raw_name": raw_name,
            "cleaned_name": cleaned_name,
            "hs_code": hs_code,
            "primary_category": best_hs_desc or "N/A",
            "status": ""
        }

        try:
            # Check if a product with this cleaned name already exists
            existing_product = Product.query.filter_by(name=cleaned_name).first()

            if existing_product:
                # Update existing product if HS code is new
                if hs_code and existing_product.hs_code != hs_code:
                    existing_product.hs_code = hs_code
                    existing_product.primary_category = best_hs_desc
                    db.session.commit()
                    results['updated'] += 1
                    processed_entry['status'] = 'Updated'
                else:
                    results['skipped_duplicates'] += 1
                    processed_entry['status'] = 'Skipped (Duplicate)'
            else:
                # Add new product
                new_product = Product(
                    name=cleaned_name,
                    hs_code=hs_code,
                    primary_category=best_hs_desc or 'N/A'
                )
                db.session.add(new_product)
                db.session.commit()
                results['added'] += 1
                processed_entry['status'] = 'Added'

            results['processed_data'].append(processed_entry)

        except IntegrityError:
            # Unique-name constraint hit despite the pre-check (race condition).
            db.session.rollback()
            log.warning(f"Integrity error for '{cleaned_name}', likely a race condition. Skipping.")
            results['skipped_duplicates'] += 1
        except Exception as e:
            db.session.rollback()
            log.error(f"Database error for '{cleaned_name}': {e}")
            results['errors'].append(f"DB Error on '{cleaned_name}': {e}")

    return results
225
+
226
+
227
+ # ───────────────────────────────────────────────────────────────────────────────
228
+ # ROUTES
229
+ # ───────────────────────────────────────────────────────────────────────────────
230
+
231
+ @app.get("/")
232
+ def root():
233
+ return jsonify({"ok": True, "message": "The Product Validation server is running."})
234
+
235
+
236
+ @app.post("/api/upload")
237
+ def upload_products():
238
+ """Endpoint to upload and process a product CSV file."""
239
+ if 'file' not in request.files:
240
+ return jsonify({"ok": False, "error": "No file part in the request"}), 400
241
+
242
+ file = request.files['file']
243
+ if file.filename == '':
244
+ return jsonify({"ok": False, "error": "No file selected"}), 400
245
+
246
+ if file and file.filename.endswith('.csv'):
247
+ filename = secure_filename(file.filename)
248
+ filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
249
+ file.save(filepath)
250
+
251
+ results = process_uploaded_file(filepath)
252
+
253
+ return jsonify({"ok": True, "message": "File processed successfully", "results": results})
254
+
255
+ return jsonify({"ok": False, "error": "Invalid file type. Please upload a CSV."}), 400
256
+
257
+
258
+ @app.get("/api/products")
259
+ def get_products():
260
+ """Endpoint to retrieve all processed products from the database."""
261
+ log.info("Request received to fetch all products.")
262
+ try:
263
+ all_products = Product.query.all()
264
+ # Use the to_dict() method to serialize each product object
265
+ products_list = [product.to_dict() for product in all_products]
266
+ log.info(f"Successfully retrieved {len(products_list)} products.")
267
+ return jsonify({"ok": True, "count": len(products_list), "products": products_list})
268
+ except Exception as e:
269
+ log.error(f"Could not retrieve products from database: {e}")
270
+ return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500
271
+
272
+
273
+ # ───────────────────────────────────────────────────────────────────────────────
274
+ # MAIN (Server Initialization)
275
+ # ───────────────────────────────────────────────────────────────────────────────
276
+
277
+ if __name__ == "__main__":
278
+ with app.app_context():
279
+ log.info("Initializing server...")
280
+ # Create database tables based on the model
281
+ db.create_all()
282
+
283
+ # Load validation data into memory
284
+ HS_CODES_DATA = parse_hs_codes_pdf()
285
+ EXISTING_PRODUCT_NAMES = load_existing_products()
286
+ log.info("Server is ready and validation data is loaded.")
287
+
288
+ port = int(os.environ.get("PORT", "7860"))
289
+ app.run(host="0.0.0.0", port=port, debug=False)
290
+