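"""FastAPI sentiment-analysis service.

Loads a fine-tuned DistilBERT classifier from ./fine_tuned_model and exposes
endpoints to label uploaded CSV/Excel rows as Negative, Neutral, or Positive,
compute the most frequent words, and download the generated result files.
"""
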
from fastapi import FastAPI, UploadFile, File, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import pandas as pd
import torch
import tempfile
import os
import re
from collections import Counter
import datetime

app = FastAPI()

# Enable CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load model
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model")
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_model")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


# Sentiment prediction
def predict_sentiment(texts):
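    # Tokenize the whole batch at once and move the tensors to the model's device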
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    encodings = {key: val.to(device) for key, val in encodings.items()}
    with torch.no_grad():
        outputs = model(**encodings)
        predictions = torch.argmax(outputs.logits, dim=1)
    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return [sentiment_map[p.item()] for p in predictions]


# Frequent words
def get_top_words(texts, n=30):
    all_words = []
    for text in texts:
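        # Keep lowercase words of three or more characters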
        tokens = re.findall(r'\b\w{3,}\b', str(text).lower())
        all_words.extend(tokens)
    counter = Counter(all_words)
    most_common = counter.most_common(n)
    return pd.DataFrame(most_common, columns=['word', 'count'])


# Identify column
def get_text_column(df):
    for col in ['content', 'tweet', 'text']:
        if col in df.columns:
            return col
    return None

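# Health-check endpoint; answers both GET and HEAD requests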
@app.api_route("/", methods=["GET", "HEAD"])
async def index(request: Request):
    return JSONResponse({
        "status": "ok",
        "message": "Server is alive",
        "timestamp": datetime.datetime.utcnow().isoformat() + "Z"
    })

# POST /predict
@app.post("/predict")
async def predict(file: UploadFile = File(...)):
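    # Parse the upload as CSV first, then fall back to Excel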
    try:
        df = pd.read_csv(file.file)
    except Exception:
        try:
            file.file.seek(0)
            df = pd.read_excel(file.file)
        except Exception:
            raise HTTPException(status_code=400, detail="Unable to read the file")

    text_col = get_text_column(df)
    if not text_col:
        raise HTTPException(status_code=400, detail='No "content", "tweet", or "text" column found')

    texts = df[text_col].astype(str).tolist()
    df['sentiment'] = predict_sentiment(texts)
    df['content_length'] = df[text_col].astype(str).apply(len)

    top_words_df = get_top_words(texts)

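    # Write both result tables to a temp directory so they can be served via /download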
    temp_dir = tempfile.mkdtemp()
    sentiment_path = os.path.join(temp_dir, 'final_data.csv')
    words_path = os.path.join(temp_dir, 'word_frequent.csv')

    df.to_csv(sentiment_path, index=False)
    top_words_df.to_csv(words_path, index=False)

    return JSONResponse({
        'sentiment_file': f'/download?file={sentiment_path}',
        'top_words_file': f'/download?file={words_path}',
        'sentiment_data': df.to_dict(orient='records'),
        'top_words_data': top_words_df.to_dict(orient='records')
    })


# POST /wordcloud
@app.post("/wordcloud")
async def wordcloud(file: UploadFile = File(...)):
    try:
        df = pd.read_csv(file.file)
    except Exception:
        try:
            file.file.seek(0)
            df = pd.read_excel(file.file)
        except Exception:
            raise HTTPException(status_code=400, detail="Unable to read the file")

    text_col = get_text_column(df)
    if not text_col:
        raise HTTPException(status_code=400, detail='No "content", "tweet", or "text" column found')

    texts = df[text_col].astype(str).tolist()
    top_words_df = get_top_words(texts)

    return JSONResponse({'top_words_data': top_words_df.to_dict(orient='records')})


# GET /download
@app.get("/download")
async def download(file: str):
    # Only serve files from the system temp directory (where /predict writes its
    # results) so arbitrary paths on the server cannot be downloaded.
    requested = os.path.realpath(file)
    temp_root = os.path.realpath(tempfile.gettempdir())
    if not requested.startswith(temp_root + os.sep) or not os.path.isfile(requested):
        raise HTTPException(status_code=404, detail="File not found")
    return FileResponse(requested, filename=os.path.basename(requested))
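
# Example usage (assuming this module is saved as main.py next to ./fine_tuned_model):
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl -F "file=@tweets.csv" http://localhost:8000/predict
# where tweets.csv is any CSV or Excel file with a "content", "tweet", or "text" column.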