# Spaces: Jandayl / Alalay
# Status: Sleeping
# File size: 7,147 Bytes

from flask import Flask, request, jsonify
from flask_cors import CORS
import joblib
import pandas as pd
import numpy as np
from feature_extractor_web import extract_features_web
import logging
import os

# Anchor paths to this file's directory rather than the process working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODELS_DIR = os.path.join(BASE_DIR, "models")

app = Flask(__name__)

# CORS allow-list: the ALLOWED_ORIGINS environment variable (comma-separated)
# replaces these defaults when set. Blank entries are dropped and trailing
# slashes are removed from each origin.
default_origins = "https://alalayfe.vercel.app,https://www.alalayfe.vercel.app,http://localhost:3000"
ALLOWED_ORIGINS = [
    candidate.rstrip("/")
    for candidate in map(str.strip, os.getenv('ALLOWED_ORIGINS', default_origins).split(','))
    if candidate
]
CORS(app, origins=ALLOWED_ORIGINS)
logger = logging.getLogger(__name__)

@app.route('/')
def home():
    """Return a small JSON index naming the service and its endpoints."""
    endpoints = ['/health', '/api/predict', '/api/predict/batch']
    payload = dict(
        service='Alalay Readability API',
        status='running',
        endpoints=endpoints,
    )
    return jsonify(payload), 200

# Load all the saved components.
# Every component name is bound up front so that a failure part-way through
# loading never leaves a name undefined; handlers in this file treat
# `model is None` as "service degraded".
# NOTE(review): joblib.load unpickles its input — only point MODELS_DIR at
# trusted artifacts.
model = label_encoder = grade_mapping = thresholds = feature_info = None
print("Loading model components...")
try:
    model = joblib.load(os.path.join(MODELS_DIR, "readability_model.pkl"))
    label_encoder = joblib.load(os.path.join(MODELS_DIR, "label_encoder.pkl"))
    grade_mapping = joblib.load(os.path.join(MODELS_DIR, "grade_mapping.pkl"))
    thresholds = joblib.load(os.path.join(MODELS_DIR, "thresholds.pkl"))
    feature_info = joblib.load(os.path.join(MODELS_DIR, "feature_info.pkl"))
    print("All components loaded successfully!")
    print(f"   Model type: {type(model.named_steps['classifier']).__name__}")
    print(f"   Classes: {label_encoder.classes_}")
except Exception as e:
    print(f"Error loading models: {e}")
    # Keep the full traceback in the server logs; the print above only shows
    # the exception message.
    logger.exception("Failed to load model components")
    # Discard any partially loaded state so the components are all-or-nothing.
    model = label_encoder = grade_mapping = thresholds = feature_info = None


def build_features_df(features: dict) -> pd.DataFrame:
    """Assemble a single-row DataFrame laid out in the model's column order.

    The column order comes from ``feature_info['all_features']``; when that
    key is absent, the keys of ``features`` are used as-is. A column missing
    from ``features`` is filled with ``'Other'`` if it is listed in
    ``feature_info['categorical_cols']`` and with ``0.0`` otherwise. Keys of
    ``features`` that are not in the column order are dropped.
    """
    column_order = feature_info.get('all_features', list(features.keys()))
    categorical = set(feature_info.get('categorical_cols', []))

    def _placeholder(name):
        # Default value for a column the extractor did not produce.
        return 'Other' if name in categorical else 0.0

    record = {
        name: features[name] if name in features else _placeholder(name)
        for name in column_order
    }
    return pd.DataFrame([record], columns=column_order)


def pick_class_with_thresholds(probabilities: np.ndarray) -> int:
    """Choose a class index, honouring per-class probability thresholds.

    A class is eligible when its probability is at least its entry in
    ``thresholds`` (0.5 when the class has no entry). The eligible class with
    the highest probability wins, the earliest index winning ties. When no
    class is eligible, the plain argmax index is returned.
    """
    labels = label_encoder.classes_
    fallback = int(np.argmax(probabilities))

    winner = None
    for idx, label in enumerate(labels):
        score = probabilities[idx]
        if not score >= thresholds.get(label, 0.5):
            continue
        # Strict '>' keeps the earliest index on ties.
        if winner is None or score > probabilities[winner]:
            winner = idx

    return fallback if winner is None else winner

# MongoDB connection (optional for now)
# try:
#     MONGO_URI = os.getenv('MONGO_URI', 'mongodb://localhost:27017/')
#     client = MongoClient(MONGO_URI)
#     db = client['readability_db']
#     # Test connection
#     client.admin.command('ping')
#     print("MongoDB connected")
# except:
#     print("MongoDB not available - continuing without database")
#     db = None

@app.route('/health', methods=['GET'])
@app.route('/api/health', methods=['GET'])
def health():
    """Report whether the model is loaded, plus its classifier type and classes.

    Always answers 200; the ``status`` field is ``'degraded'`` when the model
    failed to load and ``'healthy'`` otherwise.
    """
    if model is None:
        body = {'status': 'degraded', 'model': None, 'classes': []}
    else:
        body = {
            'status': 'healthy',
            'model': type(model.named_steps['classifier']).__name__,
            'classes': label_encoder.classes_.tolist(),
        }
    return jsonify(body), 200

@app.route('/api/predict', methods=['POST'])
def predict():
    """Predict the readability class of a single text.

    Expects a JSON object body with a string field ``text`` of at least 10
    characters after stripping surrounding whitespace.

    Returns:
        200 with ``success``, a (possibly truncated) echo of the text, the
        predicted class, its grade level, per-class confidences and the
        extracted features.
        400 when the body is not a JSON object or ``text`` is missing, not a
        string, empty, or too short.
        500 when feature extraction or inference fails.
        503 when the model was not loaded at startup.
    """
    if model is None:
        return jsonify({'error': 'Model not loaded. Check server logs.'}), 503

    # silent=True yields None for a wrong content type or malformed JSON
    # instead of raising, so client mistakes are reported as 400 rather than
    # being swallowed by the 500 handler below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    text = data.get('text')
    if text is None:
        return jsonify({'error': 'No text provided'}), 400
    if not isinstance(text, str):
        return jsonify({'error': 'Field "text" must be a string'}), 400

    text = text.strip()
    if not text:
        return jsonify({'error': 'No text provided'}), 400

    if len(text) < 10:
        return jsonify({'error': 'Text must be at least 10 characters'}), 400

    try:
        # Extract features
        features = extract_features_web(text)
        if not features:
            return jsonify({'error': 'Feature extraction failed. Check server logs.'}), 500

        # Convert to DataFrame using training-time feature order.
        features_df = build_features_df(features)

        # Get prediction
        probabilities = model.predict_proba(features_df)[0]

        # Apply threshold tuning with deterministic tie handling.
        final_prediction = pick_class_with_thresholds(probabilities)

        class_names = label_encoder.classes_
        predicted_class = class_names[final_prediction]
        grade_level = grade_mapping.get(predicted_class, predicted_class)

        # NumPy scalars such as np.int64 / np.float32 are not subclasses of
        # int / float; convert them as well so the response is serialisable.
        # NOTE(review): assumes extract_features_web may emit NumPy scalars —
        # confirm against the extractor.
        numeric_types = (int, float, np.integer, np.floating)

        # Prepare response
        response = {
            'success': True,
            'text': text[:200] + '...' if len(text) > 200 else text,
            'prediction': {
                'predicted_class': predicted_class,
                'grade_level': grade_level,
                'confidences': {
                    class_name: float(probabilities[i])
                    for i, class_name in enumerate(class_names)
                }
            },
            'features': {
                k: float(v) if isinstance(v, numeric_types) else v
                for k, v in features.items()
            }
        }

        # NOTE: MongoDB persistence of predictions is currently disabled.

        return jsonify(response), 200

    except Exception as e:
        # Record the traceback server-side; the client only sees the message.
        logger.exception('Prediction failed')
        return jsonify({'error': str(e)}), 500

@app.route('/api/predict/batch', methods=['POST'])
def batch_predict():
    """Predict the readability class of several texts in one request.

    Expects a JSON object body with a non-empty list ``texts`` whose items
    are all strings.

    Returns:
        200 with ``success``, ``count`` and one result per input text, each
        holding a (possibly truncated) echo of the text, the predicted class,
        its grade and per-class confidences.
        400 when the body is not a JSON object, ``texts`` is missing or
        empty, or ``texts`` is not a list of strings.
        500 when feature extraction or inference fails for any text.
        503 when the model was not loaded at startup.
    """
    # Same guard as the single-text endpoint: without it an unloaded model
    # surfaces as an opaque AttributeError reported as 500.
    if model is None:
        return jsonify({'error': 'Model not loaded. Check server logs.'}), 503

    # silent=True yields None for a wrong content type or malformed JSON
    # instead of raising, so client mistakes are reported as 400.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    texts = data.get('texts', [])
    if not texts:
        return jsonify({'error': 'No texts provided'}), 400

    # A bare string would otherwise be iterated character by character.
    if not isinstance(texts, list) or not all(isinstance(t, str) for t in texts):
        return jsonify({'error': 'Field "texts" must be a list of strings'}), 400

    try:
        class_names = label_encoder.classes_  # loop-invariant
        results = []
        for text in texts:
            features = extract_features_web(text)
            if not features:
                return jsonify({'error': 'Feature extraction failed. Check server logs.'}), 500

            features_df = build_features_df(features)
            probabilities = model.predict_proba(features_df)[0]
            prediction = pick_class_with_thresholds(probabilities)

            predicted_class = class_names[prediction]

            results.append({
                'text': text[:100] + '...' if len(text) > 100 else text,
                'prediction': {
                    'class': predicted_class,
                    'grade': grade_mapping.get(predicted_class, predicted_class),
                    'confidences': {
                        class_name: float(probabilities[i])
                        for i, class_name in enumerate(class_names)
                    }
                }
            })

        return jsonify({
            'success': True,
            'count': len(results),
            'results': results
        }), 200

    except Exception as e:
        # Record the traceback server-side; the client only sees the message.
        logger.exception('Batch prediction failed')
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # Listen on all interfaces. The PORT environment variable overrides the
    # default of 7860, the port Hugging Face Spaces uses.
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 7860)))