# Provenance (hosting-page residue, kept as comments so the file parses):
# Yousuf-Islam — "Rename app.py to main.py" — commit acf5a33 (verified)
import re
import string
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline
# --- 1. Define the Cleaning Function (No Pandas needed) ---
def clean_text(text: str) -> str:
    """
    Comprehensive text cleaning for Bengali text.

    Strips URLs, emails, @mentions/#hashtags, supplementary-plane emojis
    and punctuation (ASCII + Bengali dari/double dari + typographic
    quotes), then normalizes whitespace. Anything shorter than 2
    characters after cleaning is treated as pure noise and mapped to "".

    Optimized for API usage (no pandas dependency).

    Args:
        text: Raw input string; non-string or empty input yields "".

    Returns:
        The cleaned, whitespace-normalized string, or "" for noise-only input.
    """
    if not text or not isinstance(text, str):
        return ""
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove Emails
    text = re.sub(r'\S+@\S+', '', text)
    # Remove Mentions/Hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove emojis in the supplementary planes (U+10000 and above).
    # NOTE(review): BMP symbols/emoji (e.g. U+2600-U+27BF) are NOT removed
    # by this range — confirm whether that is intended.
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
    # Punctuation to strip: ASCII (string.punctuation) + Bengali dari/double
    # dari + typographic ("curly") quotes. The previous version had the smart
    # quotes mangled into plain ASCII quotes (leaving a stray empty "" literal),
    # so curly quotes silently survived cleaning; spell them out explicitly.
    exclude_chars = string.punctuation + "।॥‘’“”"
    # str.translate does the whole removal in one C-level pass.
    text = text.translate(str.maketrans('', '', exclude_chars))
    # Normalize whitespace
    text = ' '.join(text.split())
    # Discard very short leftovers (a single surviving character is noise)
    if len(text.strip()) < 2:
        return ""
    return text.strip()
# --- 2. Initialize App & Model ---
# Module-level side effects: creating the FastAPI app and loading the
# classifier happen once at import time, not per request.
app = FastAPI()
# Load your model here (ensure files are uploaded if local)
# classifier = pipeline("text-classification", model="./my_model_path")
# Or if testing with a generic one:
# NOTE(review): with no model argument, transformers falls back to a default
# English sentiment checkpoint and downloads it on first use — swap in the
# Bengali model path above before deploying.
classifier = pipeline("sentiment-analysis")
# --- 3. Define Input Structure ---
class TextInput(BaseModel):
    """Request body for POST /predict: the raw text to classify."""

    # Raw, uncleaned user text; cleaning happens server-side in predict().
    text: str
# --- 4. Define Endpoints ---
@app.get("/")
def home():
    """Health check: confirms the API process is up and serving."""
    return dict(message="Bengali Model API is running")
@app.post("/predict")
def predict(input_data: TextInput):
    """Clean the submitted text, then run the classifier on the result."""
    raw = input_data.text
    cleaned = clean_text(raw)

    # Cleaning can strip the input down to nothing (pure urls/emojis/etc.);
    # report that back instead of handing an empty string to the model.
    if not cleaned:
        return {
            "error": "Input text contained only noise (urls, emojis, etc.)",
            "original_input": raw,
            "cleaned_input": ""
        }

    # The model only ever sees the cleaned text.
    return {
        "original_input": raw,
        "cleaned_input": cleaned,
        "prediction": classifier(cleaned)
    }