VIKRAM989 committed on
Commit
e5abe38
·
1 Parent(s): f234040

Add application file

Browse files
Files changed (14) hide show
  1. .dockerignore +10 -0
  2. .gitignore +26 -0
  3. Dockerfile +34 -0
  4. main.py +61 -0
  5. metrics/__init__.py +16 -0
  6. metrics/bertscore.py +48 -0
  7. metrics/bleu.py +27 -0
  8. metrics/chrf.py +27 -0
  9. metrics/comet.py +53 -0
  10. metrics/rouge.py +35 -0
  11. models.py +40 -0
  12. requirements.txt +17 -0
  13. routes.py +76 -0
  14. utils.py +46 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ venv
2
+ __pycache__
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .git
7
+ .gitignore
8
+ node_modules
9
+ .env
10
+ .cache
.gitignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ node_modules/
3
+ dist/
4
+ build/
5
+ *.log
6
+ .env
7
+ .env.local
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+ .Python
13
+ env/
14
+ venv/
15
+ ENV/
16
+ .vscode/
17
+ .idea/
18
+ *.swp
19
+ *.swo
20
+ *~
21
+ .pytest_cache/
22
+ .coverage
23
+ htmlcov/
24
+ .venv
25
+ pip-log.txt
26
+ pip-delete-this-directory.txt
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use Python base image
FROM python:3.12-slim

# Prevent Python from writing .pyc files
ENV PYTHONDONTWRITEBYTECODE=1

# Prevent Python from buffering stdout/stderr
ENV PYTHONUNBUFFERED=1

# Set working directory
WORKDIR /app

# Install system dependencies needed by some ML libraries
# --no-install-recommends keeps the image small
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (better caching)
COPY requirements.txt .

# One RUN instead of three keeps intermediate pip state out of extra layers;
# setuptools is pinned before the requirements to satisfy build backends.
RUN pip install --upgrade pip \
    && pip install setuptools==69.5.1 \
    && pip install --no-cache-dir -r requirements.txt

# Pre-download the COMET checkpoint at build time so the first request is fast
RUN python -c "from comet import download_model; download_model('Unbabel/wmt20-comet-da')"

# Copy project files
COPY . .

# Expose API port
EXPOSE 8000

# Run FastAPI
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Real-Time Multilingual Metric Evaluator — Backend
FastAPI server that computes ROUGE, BERTScore, and COMET scores live.

Install dependencies:
pip install -r requirements.txt

Run:
uvicorn main:app --reload --port 8000
"""

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
from routes import router

# Create FastAPI app
app = FastAPI(title="Multilingual Metric Evaluator API")

# Allow requests from the React frontend and local dev
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "https://metric-evaluator.vercel.app",
        "http://localhost:5173",
        "http://localhost:3000",
        "http://127.0.0.1:5173",
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
    max_age=3600,
)


# Custom exception handler to ensure CORS headers are always sent:
# the CORS middleware does not run for unhandled exceptions, so the
# browser would otherwise surface a CORS failure instead of the 500.
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Return a JSON 500 response carrying CORS headers for any unhandled error."""
    return JSONResponse(
        status_code=500,
        content={"detail": str(exc)},
        headers={
            "Access-Control-Allow-Origin": request.headers.get("origin", "*"),
            "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
            "Access-Control-Allow-Headers": "*",
        }
    )


# health check endpoint
@app.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}


# Include routes
app.include_router(router)


if __name__ == "__main__":
    import uvicorn
    # BUG FIX: uvicorn ignores reload=True when handed an app object;
    # auto-reload requires the application as an import string.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
metrics/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Metrics package for NLP evaluation.

Re-exports the compute_* entry point of each metric module so callers
can write `from metrics import compute_rouge, ...`.
"""
from .rouge import compute_rouge
from .bertscore import compute_bertscore
from .comet import compute_comet
from .chrf import compute_chrf
from .bleu import compute_bleu

__all__ = [
    "compute_rouge",
    "compute_bertscore",
    "compute_comet",
    "compute_chrf",
    "compute_bleu",
]
metrics/bertscore.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
BERTScore metric computation
"""
import time
from models import MetricResult


def compute_bertscore(hypothesis: str, reference: str, language: str) -> MetricResult:
    """Compute BERTScore for one hypothesis/reference pair.

    Picks a language-specific BERT checkpoint when one is known for
    `language`; otherwise falls back to multilingual BERT. On any
    failure a zero-score result with `error` set is returned — this
    function never raises.
    """
    started = time.time()

    def _elapsed_ms() -> float:
        # Wall-clock time since entry, in milliseconds.
        return round((time.time() - started) * 1000, 1)

    try:
        import bert_score

        # Map common language codes to BERTScore model names
        known_models = {
            "ar": "asafaya/bert-base-arabic",
            "he": "avichr/heBERT",
            "zh": "bert-base-chinese",
            "ja": "cl-tohoku/bert-base-japanese-v3",
            "tr": "dbmdz/bert-base-turkish-cased",
            "es": "dccuchile/bert-base-spanish-wwm-cased",
        }
        chosen_model = known_models.get(language, "bert-base-multilingual-cased")

        precision, recall, f1 = bert_score.score(
            [hypothesis],
            [reference],
            model_type=chosen_model,
            lang=language,
            verbose=False,
        )
        f1_value = float(f1[0])
        return MetricResult(
            name="BERTScore",
            score=round(f1_value, 4),
            subscores={
                "precision": round(float(precision[0]), 4),
                "recall": round(float(recall[0]), 4),
                "f1": round(f1_value, 4),
                "model": chosen_model,
            },
            duration_ms=_elapsed_ms(),
        )
    except Exception as e:
        return MetricResult(
            name="BERTScore",
            score=0.0,
            error=str(e),
            duration_ms=_elapsed_ms(),
        )
metrics/bleu.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
BLEU metric computation
"""
import time
from models import MetricResult


def compute_bleu(hypothesis: str, reference: str) -> MetricResult:
    """Sentence-level BLEU via sacrebleu, normalised to [0, 1].

    Never raises: failures are reported via MetricResult.error.
    """
    started = time.time()
    try:
        from sacrebleu.metrics import BLEU

        # effective_order avoids zero scores on short segments where
        # higher-order n-grams are absent.
        scorer = BLEU(effective_order=True)
        normalised = scorer.sentence_score(hypothesis, [reference]).score / 100.0
        rounded = round(normalised, 4)
        return MetricResult(
            name="BLEU",
            score=rounded,
            subscores={"bleu": rounded},
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as e:
        return MetricResult(
            name="BLEU",
            score=0.0,
            error=str(e),
            duration_ms=round((time.time() - started) * 1000, 1)
        )
metrics/chrf.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
CHRF metric computation
"""
import time
from models import MetricResult


def compute_chrf(hypothesis: str, reference: str) -> MetricResult:
    """Character n-gram F-score via sacrebleu, normalised to [0, 1].

    Never raises: failures are reported via MetricResult.error.
    """
    started = time.time()
    try:
        from sacrebleu.metrics import CHRF

        scorer = CHRF()
        normalised = scorer.sentence_score(hypothesis, [reference]).score / 100.0
        rounded = round(normalised, 4)
        return MetricResult(
            name="CHRF",
            score=rounded,
            subscores={"chrf": rounded},
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as e:
        return MetricResult(
            name="CHRF",
            score=0.0,
            error=str(e),
            duration_ms=round((time.time() - started) * 1000, 1)
        )
metrics/comet.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
COMET metric computation
"""

import time
from typing import Optional
from models import MetricResult
from utils import get_comet


def compute_comet(hypothesis: str, reference: str, source: Optional[str]) -> MetricResult:
    """Compute the COMET score for one segment.

    COMET expects a source sentence; when the caller provides none we
    fall back to the reference so the model can still run. Never
    raises: failures are reported via MetricResult.error.
    """
    started = time.time()

    try:
        comet_model = get_comet()
        if comet_model is None:
            raise RuntimeError("COMET model not loaded — check installation.")

        sample = {
            "src": source if source else reference,
            "mt": hypothesis,
            "ref": reference,
        }

        prediction = comet_model.predict(
            [sample],
            batch_size=1,
            accelerator="cpu",
            progress_bar=False,
        )

        segment = round(float(prediction["scores"][0]), 4)

        return MetricResult(
            name="COMET",
            score=segment,
            subscores={"segment_score": segment},
            duration_ms=round((time.time() - started) * 1000, 1),
        )

    except Exception as e:
        return MetricResult(
            name="COMET",
            score=0.0,
            error=str(e),
            duration_ms=round((time.time() - started) * 1000, 1),
        )
metrics/rouge.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
ROUGE metric computation
"""
import time
from models import MetricResult
from utils import get_rouge


def compute_rouge(hypothesis: str, reference: str) -> MetricResult:
    """ROUGE-1/2/L F-measures; the overall score is their arithmetic mean.

    Never raises: failures are reported via MetricResult.error.
    """
    started = time.time()
    try:
        raw = get_rouge().score(reference, hypothesis)
        fmeasures = {
            key: raw[key].fmeasure for key in ("rouge1", "rouge2", "rougeL")
        }
        mean_f = sum(fmeasures.values()) / 3
        return MetricResult(
            name="ROUGE",
            score=round(mean_f, 4),
            subscores={key: round(val, 4) for key, val in fmeasures.items()},
            duration_ms=round((time.time() - started) * 1000, 1),
        )
    except Exception as e:
        return MetricResult(
            name="ROUGE",
            score=0.0,
            error=str(e),
            duration_ms=round((time.time() - started) * 1000, 1)
        )
models.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Pydantic models for request/response validation
"""

from pydantic import BaseModel, Field
from typing import Optional, List, Dict


class EvalRequest(BaseModel):
    """Request model for evaluation endpoint"""

    hypothesis: str  # generated summary
    reference: str  # reference summary
    source: Optional[str] = None  # original article (used by COMET)

    # ISO language code; drives BERTScore model selection
    language: str = "en"

    # Which metrics to compute (case-insensitive ids); defaults to the
    # three headline metrics
    metrics: List[str] = Field(
        default_factory=lambda: ["rouge", "bertscore", "comet"]
    )


class MetricResult(BaseModel):
    """Result for a single metric"""

    name: str
    # Overall score; n-gram metrics are scaled to 0-1, 0.0 on failure
    score: float

    # Per-component values (e.g. precision/recall/f1); kept free-form
    # because some metrics also store strings (e.g. the model name)
    subscores: Dict = Field(default_factory=dict)

    # Set instead of a meaningful score when computation failed
    error: Optional[str] = None
    duration_ms: float = 0.0


class EvalResponse(BaseModel):
    """Response model for evaluation endpoint"""

    results: List[MetricResult]
    language: str
    total_ms: float  # wall-clock time for the whole request, ms
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setuptools==69.5.1
2
+
3
+ fastapi==0.111.0
4
+ uvicorn[standard]==0.29.0
5
+
6
+ rouge-score==0.1.2
7
+ bert-score==0.3.13
8
+ unbabel-comet==2.2.7
9
+
10
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch==2.2.2
11
+ transformers==4.38.2
12
+ sacrebleu==2.6.0
13
+
14
+ pydantic>=2.0.0
15
+ sentencepiece
16
+ protobuf
17
+ sympy==1.12
routes.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
API routes for the metric evaluator
"""
import time
from fastapi import APIRouter, HTTPException
from models import EvalRequest, EvalResponse
from metrics import compute_rouge, compute_bertscore, compute_comet, compute_chrf, compute_bleu

router = APIRouter()


@router.get("/")
def root():
    """Root endpoint"""
    return {"status": "ok", "message": "Multilingual Metric Evaluator API"}


@router.get("/health")
def health():
    """Health check endpoint"""
    return {"status": "healthy"}


@router.post("/evaluate", response_model=EvalResponse)
def evaluate(req: EvalRequest):
    """Main evaluation endpoint.

    Computes each requested metric in turn. The compute_* helpers
    catch their own exceptions and report failures via the
    MetricResult.error field, so one bad metric does not abort the
    others. (The previous per-metric try/except blocks that raised
    HTTPException were unreachable for that reason and were removed.)
    """
    t_total = time.time()
    results = []

    # Case-insensitive metric selection
    metric_set = set(m.lower() for m in req.metrics)

    try:
        if "rouge" in metric_set:
            results.append(compute_rouge(req.hypothesis, req.reference))

        if "bertscore" in metric_set:
            results.append(compute_bertscore(req.hypothesis, req.reference, req.language))

        if "comet" in metric_set:
            results.append(compute_comet(req.hypothesis, req.reference, req.source))

        if "chrf" in metric_set:
            results.append(compute_chrf(req.hypothesis, req.reference))

        if "bleu" in metric_set:
            results.append(compute_bleu(req.hypothesis, req.reference))

        return EvalResponse(
            results=results,
            language=req.language,
            total_ms=round((time.time() - t_total) * 1000, 1),
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Evaluation error: {str(e)}")


@router.get("/metrics")
def list_metrics():
    """Returns all available metrics and their descriptions"""
    return {
        "metrics": [
            {"id": "rouge", "name": "ROUGE", "type": "N-Gram", "description": "Recall-Oriented Understudy for Gisting Evaluation. Measures n-gram overlap between hypothesis and reference."},
            {"id": "bertscore", "name": "BERTScore", "type": "Neural", "description": "Computes similarity using BERT token embeddings. Uses language-specific models when available."},
            {"id": "comet", "name": "COMET", "type": "Neural", "description": "Trained neural metric with human quality score regression. Best overall correlation across language families."},
            {"id": "chrf", "name": "CHRF", "type": "N-Gram", "description": "Character n-gram F-score. More robust than word-level metrics for morphologically rich languages."},
            {"id": "bleu", "name": "BLEU", "type": "N-Gram", "description": "Bilingual Evaluation Understudy. Precision-based n-gram overlap metric."},
        ]
    }
utils.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Utilities for lazy loading models and helpers
"""

# Lazy-loaded model caches (populated on first use)
_rouge_scorer = None
_comet_model = None


def get_rouge():
    """Get or create ROUGE scorer (lazy loaded)."""
    global _rouge_scorer

    if _rouge_scorer is not None:
        return _rouge_scorer

    from rouge_score import rouge_scorer

    # Stemming disabled so scoring stays language-agnostic
    _rouge_scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=False,
    )
    return _rouge_scorer


def get_comet():
    """Get or create COMET model (lazy loaded).

    Returns None when the comet package is missing or the checkpoint
    download fails; callers must handle the None case. A failed load
    is retried on the next call.
    """
    global _comet_model

    if _comet_model is not None:
        return _comet_model

    try:
        from comet import download_model, load_from_checkpoint

        print("Loading COMET model... (first run may download)")
        checkpoint_path = download_model("Unbabel/wmt20-comet-da")
        _comet_model = load_from_checkpoint(checkpoint_path)
        print("COMET model loaded successfully")
    except Exception as e:
        print(f"COMET not available: {e}")
        _comet_model = None

    return _comet_model