import os import joblib import pandas as pd from fastapi import FastAPI, Form, HTTPException, Request from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import SGDClassifier from sklearn.pipeline import Pipeline from sklearn.exceptions import NotFittedError # --- 1. Basic Setup & Configuration --- # Hugging Face provides persistent storage at the root of the project USER_MODELS_DIR = "user_models_data" os.makedirs(USER_MODELS_DIR, exist_ok=True) app = FastAPI() # --- Label Mapping --- label_mapping = { 'positive_sentiment': 0, 'negative_sentiment': 1, 'greeting': 2, 'farewell': 3, 'thanks': 4, 'searching_inquiry': 5 } reverse_label_mapping = {v: k for k, v in label_mapping.items()} # --- 2. Helper Functions --- def get_user_paths(user_id: str): user_dir = os.path.join(USER_MODELS_DIR, user_id) os.makedirs(user_dir, exist_ok=True) return { "model_path": os.path.join(user_dir, "model.joblib"), "data_path": os.path.join(user_dir, "training_data.csv") } # --- 3. API Endpoints --- @app.get("/") def read_root(): return {"message": "Welcome! Your AI is running on Hugging Face Spaces."} @app.post("/predict/") async def predict(user_id: str = Form(...), text: str = Form(...)): paths = get_user_paths(user_id) if not os.path.exists(paths["model_path"]): raise HTTPException(status_code=404, detail="Model not found. Please train it first.") model_pipeline = joblib.load(paths["model_path"]) try: predicted_index = model_pipeline.predict([text])[0] probabilities = model_pipeline.predict_proba([text])[0] predicted_label = reverse_label_mapping[predicted_index] confidence = float(probabilities[predicted_index]) return {"intent": predicted_label, "confidence": confidence} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @app.post("/train/") async def train(user_id: str = Form(...), text: str = Form(...), label: str = Form(...)): if label not in label_mapping: raise HTTPException(status_code=400, detail="Invalid label.") paths = get_user_paths(user_id) new_data = pd.DataFrame([{"text": text, "label": label}]) if os.path.exists(paths["data_path"]): new_data.to_csv(paths["data_path"], mode='a', header=False, index=False) else: new_data.to_csv(paths["data_path"], mode='w', header=True, index=False) df = pd.read_csv(paths["data_path"]) if len(df['label'].unique()) < 2: return {"message": "Model not trained. Please provide at least two different categories of examples."} df['label_numeric'] = df['label'].map(label_mapping) X = df['text'] y = df['label_numeric'] model_pipeline = Pipeline([ ('tfidf', TfidfVectorizer()), ('clf', SGDClassifier(loss='modified_huber', random_state=42)), ]) model_pipeline.fit(X, y) joblib.dump(model_pipeline, paths["model_path"]) return {"message": f"Training successful for user '{user_id}'."}