"""Flask service exposing the Alalay readability model over HTTP.

Endpoints:
    GET  /                   service banner
    GET  /health, /api/health  model status
    POST /api/predict        classify one text
    POST /api/predict/batch  classify a list of texts
"""

import logging
import os

import joblib
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify
from flask_cors import CORS

from feature_extractor_web import extract_features_web

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODELS_DIR = os.path.join(BASE_DIR, "models")

app = Flask(__name__)

# Comma-separated origin list; override with the ALLOWED_ORIGINS env var.
default_origins = "https://alalayfe.vercel.app,https://www.alalayfe.vercel.app,http://localhost:3000"
ALLOWED_ORIGINS = [
    origin.strip().rstrip("/")
    for origin in os.getenv('ALLOWED_ORIGINS', default_origins).split(',')
    if origin.strip()
]
CORS(app, origins=ALLOWED_ORIGINS)

logger = logging.getLogger(__name__)


@app.route('/')
def home():
    """Return a static description of the service and its endpoints."""
    return jsonify({
        'service': 'Alalay Readability API',
        'status': 'running',
        'endpoints': [
            '/health',
            '/api/predict',
            '/api/predict/batch'
        ]
    }), 200


# Pre-declare every component so the names are always bound, even when
# loading fails part-way through the try block below.
model = None
label_encoder = None
grade_mapping = None
thresholds = None
feature_info = None

# Load all the saved components
print("Loading model components...")
try:
    model = joblib.load(os.path.join(MODELS_DIR, "readability_model.pkl"))
    label_encoder = joblib.load(os.path.join(MODELS_DIR, "label_encoder.pkl"))
    grade_mapping = joblib.load(os.path.join(MODELS_DIR, "grade_mapping.pkl"))
    thresholds = joblib.load(os.path.join(MODELS_DIR, "thresholds.pkl"))
    feature_info = joblib.load(os.path.join(MODELS_DIR, "feature_info.pkl"))
    print("All components loaded successfully!")
    print(f"   Model type: {type(model.named_steps['classifier']).__name__}")
    print(f"   Classes: {label_encoder.classes_}")
except Exception as e:
    print(f"Error loading models: {e}")
    logger.exception("Failed to load model components")
    # `model is None` is the single "service unavailable" flag the routes check.
    model = None


def build_features_df(features: dict) -> pd.DataFrame:
    """Build a model-ready DataFrame with the same feature order used in training.

    Columns come from ``feature_info['all_features']`` (falling back to the
    keys of *features*). Columns missing from *features* are filled with
    ``'Other'`` when listed in ``feature_info['categorical_cols']`` and with
    ``0.0`` otherwise.
    """
    all_features = feature_info.get('all_features', list(features.keys()))
    categorical_cols = set(feature_info.get('categorical_cols', []))

    row = {}
    for col in all_features:
        if col in features:
            row[col] = features[col]
        elif col in categorical_cols:
            row[col] = 'Other'
        else:
            row[col] = 0.0

    return pd.DataFrame([row], columns=all_features)


def pick_class_with_thresholds(probabilities: np.ndarray) -> int:
    """Use thresholds when available, otherwise fall back to argmax probability.

    A class is eligible when its probability reaches its entry in
    ``thresholds`` (0.5 when the class has no entry). The most probable
    eligible class wins; with no eligible class the plain argmax is returned.
    """
    classes = label_encoder.classes_
    base_idx = int(np.argmax(probabilities))

    eligible = [
        i for i, class_name in enumerate(classes)
        if probabilities[i] >= thresholds.get(class_name, 0.5)
    ]

    if not eligible:
        return base_idx

    return max(eligible, key=lambda i: probabilities[i])


def _get_json_object():
    """Return the request body as a dict, or None if it is not a JSON object."""
    # silent=True yields None instead of raising on a bad body/content type,
    # so the caller can answer with a 400 rather than a 500.
    data = request.get_json(silent=True)
    return data if isinstance(data, dict) else None


def _json_safe_feature(value):
    """Convert a feature value into something ``jsonify`` can serialise."""
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if isinstance(value, np.generic):
        # Remaining NumPy scalars (e.g. np.bool_, np.str_) -> native Python.
        return value.item()
    return value


def _classify(text):
    """Run the full prediction pipeline for one text.

    Returns ``(predicted_class, confidences, features)``, or ``None`` when
    feature extraction produced nothing.
    """
    features = extract_features_web(text)
    if not features:
        return None

    # Convert to DataFrame using training-time feature order.
    features_df = build_features_df(features)
    probabilities = model.predict_proba(features_df)[0]

    # Apply threshold tuning with deterministic tie handling.
    class_idx = pick_class_with_thresholds(probabilities)
    predicted_class = label_encoder.classes_[class_idx]
    confidences = {
        class_name: float(probabilities[i])
        for i, class_name in enumerate(label_encoder.classes_)
    }
    return predicted_class, confidences, features


# MongoDB connection (optional for now)
# try:
#     MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017/')
#     client = MongoClient(MONGO_URI)
#     db = client['readability_db']
#     # Test connection
#     client.admin.command('ping')
#     print("MongoDB connected")
# except:
#     print("MongoDB not available - continuing without database")
#     db = None


@app.route('/health', methods=['GET'])
@app.route('/api/health', methods=['GET'])
def health():
    """Report whether the model is loaded, plus its type and class labels."""
    model_name = type(model.named_steps['classifier']).__name__ if model is not None else None
    classes = label_encoder.classes_.tolist() if model is not None else []
    return jsonify({
        'status': 'healthy' if model is not None else 'degraded',
        'model': model_name,
        'classes': classes
    }), 200


@app.route('/api/predict', methods=['POST'])
def predict():
    """Classify the ``text`` field of a JSON request body.

    Responses:
        200  prediction, per-class confidences and extracted features
        400  body is not a JSON object, or ``text`` is missing/invalid/too short
        500  feature extraction or inference failed (details go to the log)
        503  model components are not loaded
    """
    if model is None:
        return jsonify({'error': 'Model not loaded. Check server logs.'}), 503

    data = _get_json_object()
    if data is None:
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text', '')
    if not isinstance(text, str):
        return jsonify({'error': "'text' must be a string"}), 400

    text = text.strip()
    if not text:
        return jsonify({'error': 'No text provided'}), 400

    if len(text) < 10:
        return jsonify({'error': 'Text must be at least 10 characters'}), 400

    try:
        outcome = _classify(text)
        if outcome is None:
            return jsonify({'error': 'Feature extraction failed. Check server logs.'}), 500

        predicted_class, confidences, features = outcome
        grade_level = grade_mapping.get(predicted_class, predicted_class)

        # Prepare response
        response = {
            'success': True,
            'text': (text[:200] + '...') if len(text) > 200 else text,
            'prediction': {
                'predicted_class': predicted_class,
                'grade_level': grade_level,
                'confidences': confidences
            },
            'features': {k: _json_safe_feature(v) for k, v in features.items()}
        }

        # # Store in MongoDB if available
        # if db:
        #     db.texts.insert_one({
        #         'text': text,
        #         'prediction': response['prediction'],
        #         'features': features,
        #         'timestamp': datetime.utcnow()
        #     })

        return jsonify(response), 200

    except Exception:
        # Keep internals out of the response; the traceback goes to the log.
        logger.exception("Prediction failed")
        return jsonify({'error': 'Internal server error'}), 500


@app.route('/api/predict/batch', methods=['POST'])
def batch_predict():
    """Classify every string in the ``texts`` list of a JSON request body.

    Responses:
        200  ``results`` with one prediction per input text, in order
        400  body is not a JSON object, or ``texts`` is empty / not a list of strings
        500  feature extraction or inference failed (details go to the log)
        503  model components are not loaded
    """
    if model is None:
        return jsonify({'error': 'Model not loaded. Check server logs.'}), 503

    data = _get_json_object()
    if data is None:
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    texts = data.get('texts', [])
    if not texts:
        return jsonify({'error': 'No texts provided'}), 400

    # A bare string would otherwise be iterated character by character.
    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        return jsonify({'error': "'texts' must be a list of strings"}), 400

    try:
        results = []
        for text in texts:
            outcome = _classify(text)
            if outcome is None:
                return jsonify({'error': 'Feature extraction failed. Check server logs.'}), 500

            predicted_class, confidences, _ = outcome
            results.append({
                'text': (text[:100] + '...') if len(text) > 100 else text,
                'prediction': {
                    'class': predicted_class,
                    'grade': grade_mapping.get(predicted_class, predicted_class),
                    'confidences': confidences
                }
            })

        return jsonify({
            'success': True,
            'count': len(results),
            'results': results
        }), 200

    except Exception:
        # Keep internals out of the response; the traceback goes to the log.
        logger.exception("Batch prediction failed")
        return jsonify({'error': 'Internal server error'}), 500


if __name__ == '__main__':
    # Hugging Face Spaces uses port 7860
    port = int(os.getenv('PORT', 7860))
    app.run(host='0.0.0.0', port=port)