File size: 12,147 Bytes
d1f57b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6eb56d5
9fcc741
 
6eb56d5
4d8c2a5
9fcc741
6eb56d5
d1f57b9
4d8c2a5
6eb56d5
d1f57b9
b6ed410
 
 
3e16bd6
d1f57b9
 
4d8c2a5
d1f57b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6ed410
d1f57b9
 
 
f27ad5a
 
 
a2c8a5f
 
 
 
d1f57b9
f27ad5a
 
 
 
 
 
 
 
a2c8a5f
f27ad5a
a2c8a5f
 
 
f27ad5a
a2c8a5f
f27ad5a
d1f57b9
f27ad5a
a2c8a5f
d1f57b9
6eb56d5
 
 
 
 
d1f57b9
6eb56d5
 
 
 
d1f57b9
6eb56d5
3e16bd6
d1f57b9
 
 
 
 
b6ed410
d1f57b9
 
b6ed410
 
d1f57b9
6eb56d5
 
3e16bd6
6eb56d5
5d5ed5e
6eb56d5
5d5ed5e
6eb56d5
 
 
 
3e16bd6
6eb56d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1f57b9
 
 
 
 
 
b6ed410
 
 
d1f57b9
 
6eb56d5
6affcb6
d1f57b9
 
 
 
 
 
 
 
6eb56d5
3e16bd6
 
 
 
6eb56d5
 
 
d1f57b9
 
 
 
 
6eb56d5
 
d1f57b9
6eb56d5
d1f57b9
6eb56d5
 
d1f57b9
 
 
 
 
 
6eb56d5
 
 
f27ad5a
d1f57b9
6eb56d5
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import os
import io
import logging
import re
import pandas as pd
from flask import Flask, request, jsonify
from flask_cors import CORS
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.exc import IntegrityError
from thefuzz import process, fuzz
from werkzeug.utils import secure_filename

# ───────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ───────────────────────────────────────────────────────────────────────────────

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("product-pipeline-api")

app = Flask(__name__)
CORS(app)

# --- App Configuration ---
basedir = os.path.abspath(os.path.dirname(__file__))
DB_FOLDER = os.path.join(basedir, 'data')
DB_PATH = os.path.join(DB_FOLDER, 'products.db')
os.makedirs(DB_FOLDER, exist_ok=True) 

app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{DB_PATH}'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = os.path.join(basedir, 'uploads')
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

# --- File Upload Configuration ---
ALLOWED_EXTENSIONS = {'csv', 'xls', 'xlsx'}

db = SQLAlchemy(app)

# ───────────────────────────────────────────────────────────────────────────────
# DATABASE MODEL
# ───────────────────────────────────────────────────────────────────────────────

class Product(db.Model):
    """Represents the 'products' table."""
    __tablename__ = 'products'
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255), nullable=False, unique=True)
    category_id = db.Column(db.Integer, nullable=False, default=1)
    primary_category = db.Column(db.String(255), nullable=False, default='N/A')
    hs_code = db.Column(db.String(255), nullable=True)

    def to_dict(self):
        """Serializes the Product object to a dictionary."""
        return {
            'id': self.id,
            'name': self.name,
            'category_id': self.category_id,
            'primary_category': self.primary_category,
            'hs_code': self.hs_code
        }

    def __repr__(self):
        return f'<Product {self.id}: {self.name}>'

# ───────────────────────────────────────────────────────────────────────────────
# DATA LOADING & PRE-PROCESSING
# ───────────────────────────────────────────────────────────────────────────────

FUZZY_MATCH_THRESHOLD = 85
EXISTING_PRODUCT_NAMES = []
HS_CODE_DESCRIPTIONS = {}

# --- FINAL FIX: Point to the correct .xlsx file and use pandas.read_excel ---
def load_hs_codes(filename="HS_Codes_for_use_under_FDMS.xlsx"):
    """Loads HS codes from the user-provided clean Excel file."""
    log.info(f"Loading HS Codes from '{filename}'...")
    if not os.path.exists(filename):
        log.error(f"HS Code file not found at '{filename}'. Categorization will fail.")
        return
    try:
        # Use read_excel for .xlsx files
        df = pd.read_excel(filename, engine='openpyxl')
        
        # Check for expected columns
        if 'HS CODE' not in df.columns or 'GOODS DESCRIPTION' not in df.columns:
            log.error("HS Code Excel file is missing 'HS CODE' or 'GOODS DESCRIPTION' columns.")
            return

        for _, row in df.iterrows():
            # Convert to string and strip whitespace to handle potential data issues
            code = str(row['HS CODE']).strip()
            desc = str(row['GOODS DESCRIPTION']).strip()
            if code and desc and code != 'nan' and desc != 'nan':
                # Ensure the code is treated as a string, preserving leading zeros if any
                HS_CODE_DESCRIPTIONS[desc] = code
        log.info(f"Successfully parsed {len(HS_CODE_DESCRIPTIONS)} HS codes from Excel file.")
    except Exception as e:
        log.error(f"Failed to load HS codes from Excel file: {e}")


def load_existing_products(filepath='Product List.csv'):
    log.info(f"Loading master product list from '{filepath}'...")
    if not os.path.exists(filepath):
        log.error(f"Master product list not found at '{filepath}'. Validation may be inaccurate.")
        return []
    try:
        df = pd.read_csv(filepath, usecols=[1], names=['name'], header=0)
        product_names = df['name'].dropna().unique().tolist()
        log.info(f"Loaded {len(product_names)} unique existing products.")
        return product_names
    except Exception as e:
        log.error(f"Failed to load master product list: {e}")
        return []

# ───────────────────────────────────────────────────────────────────────────────
# CORE PROCESSING PIPELINE
# ───────────────────────────────────────────────────────────────────────────────

def process_uploaded_file(filepath, filename):
    log.info(f"Starting processing for file: {filepath}")
    results = {
        "processed": 0, "added": 0, "updated": 0, "skipped_duplicates": 0,
        "errors": [], "processed_data": []
    }
    df = None

    try:
        file_ext = filename.rsplit('.', 1)[1].lower()
        if file_ext == 'csv':
            df = pd.read_csv(filepath, header=None, usecols=[1], names=['product_name'])
        elif file_ext in ['xls', 'xlsx']:
            df = pd.read_excel(filepath, header=None, usecols=[1], names=['product_name'], engine='openpyxl')
    except ValueError:
         results['errors'].append("Could not find the product name column. Ensure the product name is in the second column.")
         return results
    except Exception as e:
        log.error(f"Could not read the uploaded file: {e}")
        results['errors'].append(f"Invalid file format or corrupt file: {e}")
        return results

    if df.empty:
        results['errors'].append("The uploaded file is empty.")
        return results

    for index, row in df.iterrows():
        raw_name = row['product_name']
        results['processed'] += 1

        if not isinstance(raw_name, str) or not raw_name.strip():
            continue

        cleaned_name = raw_name.strip()

        best_match, score = process.extractOne(
            cleaned_name, EXISTING_PRODUCT_NAMES, scorer=fuzz.token_sort_ratio
        ) if EXISTING_PRODUCT_NAMES else (cleaned_name, 100)
        validated_name = best_match if score >= FUZZY_MATCH_THRESHOLD else cleaned_name

        best_hs_desc, _ = process.extractOne(
            validated_name, HS_CODE_DESCRIPTIONS.keys()
        ) if HS_CODE_DESCRIPTIONS else (None, 0)
        hs_code = HS_CODE_DESCRIPTIONS.get(best_hs_desc)

        processed_entry = {
            "raw_name": raw_name, "cleaned_name": validated_name, "hs_code": hs_code,
            "primary_category": best_hs_desc or "N/A", "status": ""
        }
        try:
            with app.app_context():
                existing_product = Product.query.filter_by(name=validated_name).first()
                if existing_product:
                    if hs_code and existing_product.hs_code != hs_code:
                        existing_product.hs_code = hs_code
                        existing_product.primary_category = best_hs_desc
                        db.session.commit()
                        results['updated'] += 1
                        processed_entry['status'] = 'Updated'
                    else:
                        results['skipped_duplicates'] += 1
                        processed_entry['status'] = 'Skipped (Duplicate)'
                else:
                    new_product = Product(name=validated_name, hs_code=hs_code, primary_category=best_hs_desc or 'N/A')
                    db.session.add(new_product)
                    db.session.commit()
                    results['added'] += 1
                    processed_entry['status'] = 'Added'
                results['processed_data'].append(processed_entry)
        except Exception as e:
            db.session.rollback()
            log.error(f"Database error for '{validated_name}': {e}")
            results['errors'].append(f"DB Error on '{validated_name}': {e}")
    return results

# ───────────────────────────────────────────────────────────────────────────────
# ROUTES
# ───────────────────────────────────────────────────────────────────────────────

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.get("/")
def root():
    return jsonify({"ok": True, "message": "The Product Validation server is running."})

@app.post("/api/upload")
def upload_products():
    if 'file' not in request.files:
        return jsonify({"ok": False, "error": "No file part in the request"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"ok": False, "error": "No file selected"}), 400

    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)
        results = process_uploaded_file(filepath, filename)
        return jsonify({"ok": True, "message": "File processed successfully", "results": results})

    return jsonify({"ok": False, "error": f"Invalid file type. Allowed types are: {', '.join(ALLOWED_EXTENSIONS)}"}), 400

@app.get("/api/products")
def get_products():
    log.info("Request received to fetch all products.")
    try:
        all_products = Product.query.all()
        products_list = [product.to_dict() for product in all_products]
        log.info(f"Successfully retrieved {len(products_list)} products.")
        return jsonify({"ok": True, "count": len(products_list), "products": products_list})
    except Exception as e:
        log.error(f"Could not retrieve products from database: {e}")
        return jsonify({"ok": False, "error": "Failed to retrieve products from the database."}), 500

# ───────────────────────────────────────────────────────────────────────────────
# MAIN (Server Initialization)
# ───────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    with app.app_context():
        log.info("Initializing server...")
        db.create_all()
        load_hs_codes() 
        EXISTING_PRODUCT_NAMES = load_existing_products()
        log.info(f"Server is ready. Database is at: {DB_PATH}")

    port = int(os.environ.get("PORT", "7860"))
    app.run(host="0.0.0.0", port=port, debug=False)