Spaces:
Runtime error
Runtime error
Add application file
Browse files- .dockerignore +10 -0
- .gitignore +26 -0
- Dockerfile +34 -0
- main.py +61 -0
- metrics/__init__.py +16 -0
- metrics/bertscore.py +48 -0
- metrics/bleu.py +27 -0
- metrics/chrf.py +27 -0
- metrics/comet.py +53 -0
- metrics/rouge.py +35 -0
- models.py +40 -0
- requirements.txt +17 -0
- routes.py +76 -0
- utils.py +46 -0
.dockerignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv
|
| 2 |
+
__pycache__
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
.git
|
| 7 |
+
.gitignore
|
| 8 |
+
node_modules
|
| 9 |
+
.env
|
| 10 |
+
.cache
|
.gitignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.DS_Store
|
| 2 |
+
node_modules/
|
| 3 |
+
dist/
|
| 4 |
+
build/
|
| 5 |
+
*.log
|
| 6 |
+
.env
|
| 7 |
+
.env.local
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.py[cod]
|
| 10 |
+
*$py.class
|
| 11 |
+
*.so
|
| 12 |
+
.Python
|
| 13 |
+
env/
|
| 14 |
+
venv/
|
| 15 |
+
ENV/
|
| 16 |
+
.vscode/
|
| 17 |
+
.idea/
|
| 18 |
+
*.swp
|
| 19 |
+
*.swo
|
| 20 |
+
*~
|
| 21 |
+
.pytest_cache/
|
| 22 |
+
.coverage
|
| 23 |
+
htmlcov/
|
| 24 |
+
.venv
|
| 25 |
+
pip-log.txt
|
| 26 |
+
pip-delete-this-directory.txt
|
Dockerfile
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python base image
FROM python:3.12-slim

# Prevent Python from writing .pyc files
ENV PYTHONDONTWRITEBYTECODE=1

# Prevent Python from buffering stdout/stderr
ENV PYTHONUNBUFFERED=1

# Set working directory
WORKDIR /app

# Install system dependencies needed by some ML libraries
RUN apt-get update && apt-get install -y \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (better caching)
COPY requirements.txt .

RUN pip install --upgrade pip
# NOTE(review): setuptools is pinned to match requirements.txt — presumably
# a newer setuptools broke one of the ML dependencies; confirm still needed.
RUN pip install setuptools==69.5.1
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download the COMET checkpoint at build time so the first request
# does not pay the (large) model download cost at runtime.
RUN python -c "from comet import download_model; download_model('Unbabel/wmt20-comet-da')"
# Copy project files
COPY . .

# Expose API port
EXPOSE 8000

# Run FastAPI
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
main.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Real-Time Multilingual Metric Evaluator — Backend
FastAPI server that computes ROUGE, BERTScore, and COMET scores live.

Install dependencies:
    pip install -r requirements.txt

Run:
    uvicorn main:app --reload --port 8000
"""

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from routes import router

# Create FastAPI app
app = FastAPI(title="Multilingual Metric Evaluator API")

# Allow requests from the React frontend and local dev
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "https://metric-evaluator.vercel.app",
        "http://localhost:5173",
        "http://localhost:3000",
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
    max_age=3600,
)


# Custom exception handler to ensure CORS headers are always sent:
# unhandled 500s would otherwise be returned without them, and the browser
# would report a CORS failure instead of the real error.
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Return a JSON 500 with explicit CORS headers for any unhandled error."""
    return JSONResponse(
        status_code=500,
        content={"detail": str(exc)},
        headers={
            "Access-Control-Allow-Origin": request.headers.get("origin", "*"),
            "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
            "Access-Control-Allow-Headers": "*",
        }
    )


# health check endpoint
@app.get("/health")
def health():
    return {"status": "ok"}


# Include routes
app.include_router(router)


if __name__ == "__main__":
    import uvicorn
    # FIX: reload requires an application *import string*; passing the `app`
    # object makes uvicorn warn "You must pass the application as an import
    # string to enable 'reload'" and run without reload.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|
metrics/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Metrics package for NLP evaluation
"""
# Re-export each metric's compute function so callers can write
# `from metrics import compute_rouge` instead of importing submodules.
from .rouge import compute_rouge
from .bertscore import compute_bertscore
from .comet import compute_comet
from .chrf import compute_chrf
from .bleu import compute_bleu

# Explicit public API of the package.
__all__ = [
    "compute_rouge",
    "compute_bertscore",
    "compute_comet",
    "compute_chrf",
    "compute_bleu",
]
|
metrics/bertscore.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
BERTScore metric computation
"""
import time
from models import MetricResult

# Preferred language-specific BERT checkpoints; any language not listed
# falls back to multilingual BERT.
_LANG_MODEL_MAP = {
    "ar": "asafaya/bert-base-arabic",
    "he": "avichr/heBERT",
    "zh": "bert-base-chinese",
    "ja": "cl-tohoku/bert-base-japanese-v3",
    "tr": "dbmdz/bert-base-turkish-cased",
    "es": "dccuchile/bert-base-spanish-wwm-cased",
}


def compute_bertscore(hypothesis: str, reference: str, language: str) -> MetricResult:
    """Compute BERTScore"""
    started = time.time()
    try:
        import bert_score

        model = _LANG_MODEL_MAP.get(language, "bert-base-multilingual-cased")

        precision, recall, f1 = bert_score.score(
            [hypothesis], [reference],
            model_type=model,
            lang=language,
            verbose=False,
        )
        p = round(float(precision[0]), 4)
        r = round(float(recall[0]), 4)
        f = round(float(f1[0]), 4)
        return MetricResult(
            name="BERTScore",
            score=f,
            subscores={
                "precision": p,
                "recall": r,
                "f1": f,
                "model": model,
            },
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as err:
        return MetricResult(
            name="BERTScore",
            score=0.0,
            error=str(err),
            duration_ms=round((time.time() - started) * 1000, 1),
        )
|
metrics/bleu.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
BLEU metric computation
"""
import time
from models import MetricResult


def compute_bleu(hypothesis: str, reference: str) -> MetricResult:
    """BLEU score"""
    started = time.time()
    try:
        from sacrebleu.metrics import BLEU

        # effective_order avoids degenerate scores on short sentences
        # where high-order n-grams have no matches.
        scorer = BLEU(effective_order=True)
        normalized = round(
            scorer.sentence_score(hypothesis, [reference]).score / 100.0, 4
        )
        return MetricResult(
            name="BLEU",
            score=normalized,
            subscores={"bleu": normalized},
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as err:
        return MetricResult(
            name="BLEU",
            score=0.0,
            error=str(err),
            duration_ms=round((time.time() - started) * 1000, 1),
        )
|
metrics/chrf.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
CHRF metric computation
"""
import time
from models import MetricResult


def compute_chrf(hypothesis: str, reference: str) -> MetricResult:
    """Character n-gram F-score"""
    started = time.time()
    try:
        from sacrebleu.metrics import CHRF

        # sacrebleu reports 0–100; normalize to 0–1 like the other metrics.
        normalized = round(
            CHRF().sentence_score(hypothesis, [reference]).score / 100.0, 4
        )
        return MetricResult(
            name="CHRF",
            score=normalized,
            subscores={"chrf": normalized},
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as err:
        return MetricResult(
            name="CHRF",
            score=0.0,
            error=str(err),
            duration_ms=round((time.time() - started) * 1000, 1),
        )
|
metrics/comet.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
COMET metric computation
"""

import time
from typing import Optional
from models import MetricResult
from utils import get_comet


def compute_comet(hypothesis: str, reference: str, source: Optional[str]) -> MetricResult:
    """Compute COMET score"""
    started = time.time()

    try:
        model = get_comet()
        if model is None:
            raise RuntimeError("COMET model not loaded — check installation.")

        # COMET expects source text; fall back to the reference when the
        # caller did not provide the original article.
        sample = {
            "src": source if source else reference,
            "mt": hypothesis,
            "ref": reference,
        }

        prediction = model.predict(
            [sample],
            batch_size=1,
            accelerator="cpu",
            progress_bar=False,
        )

        segment = round(float(prediction["scores"][0]), 4)

        return MetricResult(
            name="COMET",
            score=segment,
            subscores={"segment_score": segment},
            duration_ms=round((time.time() - started) * 1000, 1),
        )

    except Exception as err:
        return MetricResult(
            name="COMET",
            score=0.0,
            error=str(err),
            duration_ms=round((time.time() - started) * 1000, 1),
        )
|
metrics/rouge.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
ROUGE metric computation
"""
import time
from models import MetricResult
from utils import get_rouge


def compute_rouge(hypothesis: str, reference: str) -> MetricResult:
    """Compute ROUGE score"""
    started = time.time()
    try:
        # rouge_score's API takes (target, prediction) in that order.
        raw = get_rouge().score(reference, hypothesis)
        fmeasures = {
            variant: raw[variant].fmeasure
            for variant in ("rouge1", "rouge2", "rougeL")
        }
        # Overall score is the plain mean of the three F-measures.
        overall = sum(fmeasures.values()) / 3
        return MetricResult(
            name="ROUGE",
            score=round(overall, 4),
            subscores={key: round(val, 4) for key, val in fmeasures.items()},
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as err:
        return MetricResult(
            name="ROUGE",
            score=0.0,
            error=str(err),
            duration_ms=round((time.time() - started) * 1000, 1),
        )
|
models.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Pydantic models for request/response validation
"""

from pydantic import BaseModel, Field
from typing import Optional, List, Dict


class EvalRequest(BaseModel):
    """Request model for evaluation endpoint"""

    hypothesis: str  # generated summary
    reference: str  # reference summary
    source: Optional[str] = None  # original article (used by COMET)

    # ISO language code; selects the BERTScore checkpoint downstream
    language: str = "en"

    # Metric ids to compute (case-insensitive downstream); defaults to the
    # three primary metrics
    metrics: List[str] = Field(
        default_factory=lambda: ["rouge", "bertscore", "comet"]
    )


class MetricResult(BaseModel):
    """Result for a single metric"""

    name: str
    # Primary score; 0.0 when computation failed. NOTE(review): most
    # metrics normalize to [0, 1], but confirm the range for COMET.
    score: float

    # Per-metric breakdown, e.g. precision/recall/f1 for BERTScore; the
    # BERTScore entry also carries a "model" string, so values are untyped.
    subscores: Dict = Field(default_factory=dict)

    # Error message when the metric raised; None on success
    error: Optional[str] = None
    duration_ms: float = 0.0


class EvalResponse(BaseModel):
    """Response model for evaluation endpoint"""

    results: List[MetricResult]
    language: str
    total_ms: float
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
setuptools==69.5.1
|
| 2 |
+
|
| 3 |
+
fastapi==0.111.0
|
| 4 |
+
uvicorn[standard]==0.29.0
|
| 5 |
+
|
| 6 |
+
rouge-score==0.1.2
|
| 7 |
+
bert-score==0.3.13
|
| 8 |
+
unbabel-comet==2.2.7
|
| 9 |
+
|
| 10 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.2.2
|
| 11 |
+
transformers==4.38.2
|
| 12 |
+
sacrebleu==2.6.0
|
| 13 |
+
|
| 14 |
+
pydantic>=2.0.0
|
| 15 |
+
sentencepiece
|
| 16 |
+
protobuf
|
| 17 |
+
sympy==1.12
|
routes.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API routes for the metric evaluator
|
| 3 |
+
"""
|
| 4 |
+
import time
|
| 5 |
+
from fastapi import APIRouter, HTTPException
|
| 6 |
+
from models import EvalRequest, EvalResponse
|
| 7 |
+
from metrics import compute_rouge, compute_bertscore, compute_comet, compute_chrf, compute_bleu
|
| 8 |
+
|
| 9 |
+
router = APIRouter()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@router.get("/")
def root():
    """Root endpoint — simple liveness/identity response."""
    payload = dict(status="ok", message="Multilingual Metric Evaluator API")
    return payload
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@router.get("/health")
def health():
    """Health check endpoint."""
    payload = dict(status="healthy")
    return payload
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@router.post("/evaluate", response_model=EvalResponse)
def evaluate(req: EvalRequest):
    """Main evaluation endpoint.

    Runs every requested metric and returns per-metric results plus total
    wall-clock time. Metric ids are matched case-insensitively; unknown ids
    are silently ignored. Raises HTTP 500 on any unexpected failure.
    """
    t_total = time.time()
    results = []

    # Case-insensitive set of requested metric ids.
    metric_set = set(m.lower() for m in req.metrics)

    try:
        # NOTE: each compute_* helper catches its own exceptions and returns
        # a MetricResult with `error` set, so no per-metric try/except is
        # needed here (the previous inner handlers were unreachable).
        if "rouge" in metric_set:
            results.append(compute_rouge(req.hypothesis, req.reference))

        if "bertscore" in metric_set:
            results.append(compute_bertscore(req.hypothesis, req.reference, req.language))

        if "comet" in metric_set:
            results.append(compute_comet(req.hypothesis, req.reference, req.source))

        if "chrf" in metric_set:
            results.append(compute_chrf(req.hypothesis, req.reference))

        if "bleu" in metric_set:
            results.append(compute_bleu(req.hypothesis, req.reference))

        return EvalResponse(
            results=results,
            language=req.language,
            total_ms=round((time.time() - t_total) * 1000, 1),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Evaluation error: {str(e)}")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@router.get("/metrics")
def list_metrics():
    """Returns all available metrics and their descriptions"""
    # Static catalog consumed by the frontend; `id` values match the
    # (lowercased) names accepted by the /evaluate endpoint.
    return {
        "metrics": [
            {"id": "rouge", "name": "ROUGE", "type": "N-Gram", "description": "Recall-Oriented Understudy for Gisting Evaluation. Measures n-gram overlap between hypothesis and reference."},
            {"id": "bertscore", "name": "BERTScore", "type": "Neural", "description": "Computes similarity using BERT token embeddings. Uses language-specific models when available."},
            {"id": "comet", "name": "COMET", "type": "Neural", "description": "Trained neural metric with human quality score regression. Best overall correlation across language families."},
            {"id": "chrf", "name": "CHRF", "type": "N-Gram", "description": "Character n-gram F-score. More robust than word-level metrics for morphologically rich languages."},
            {"id": "bleu", "name": "BLEU", "type": "N-Gram", "description": "Bilingual Evaluation Understudy. Precision-based n-gram overlap metric."},
        ]
    }
|
utils.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Utilities for lazy loading models and helpers
"""

# Lazy-loaded model caches
_rouge_scorer = None
_comet_model = None


def get_rouge():
    """Get or create ROUGE scorer (lazy loaded)"""
    global _rouge_scorer

    if _rouge_scorer is not None:
        return _rouge_scorer

    from rouge_score import rouge_scorer

    # Stemming disabled: scorer is used for arbitrary languages.
    _rouge_scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=False,
    )
    return _rouge_scorer


def get_comet():
    """Get or create COMET model (lazy loaded)"""
    global _comet_model

    if _comet_model is not None:
        return _comet_model

    try:
        from comet import download_model, load_from_checkpoint

        print("Loading COMET model... (first run may download)")
        checkpoint_path = download_model("Unbabel/wmt20-comet-da")
        _comet_model = load_from_checkpoint(checkpoint_path)
        print("COMET model loaded successfully")
    except Exception as e:
        # Leave the cache as None; callers treat None as "COMET unavailable".
        print(f"COMET not available: {e}")
        _comet_model = None

    return _comet_model