Spaces:
Sleeping
Sleeping
| import socket | |
| import logging | |
| import sacrebleu | |
| from datasets import load_dataset | |
| from tqdm import tqdm | |
# Configure the root logger once at import time: INFO level, with a
# timestamp and severity prefix on every message this script emits.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def translate_via_tcp(text: str, host='127.0.0.1', port=18080, timeout=30.0) -> str:
    """Send one English string to the translation server over TCP and return its reply.

    The request is sent as a single newline-terminated UTF-8 line. The reply
    is accumulated until a newline arrives or the peer closes the connection,
    so replies larger than one ``recv`` buffer are not truncated and a
    multi-byte UTF-8 character is never decoded from a partial read.

    NOTE(review): this assumes the server terminates its reply with a newline
    or closes the socket afterwards -- confirm against the C++ server. If it
    does neither, reading stops when ``timeout`` expires and whatever has
    arrived by then is returned.

    Args:
        text: English source text. Embedded line breaks are replaced with
            spaces so the text stays a single request line.
        host: Address of the translation server.
        port: TCP port of the translation server.
        timeout: Seconds to wait for each socket operation; ``None`` blocks
            indefinitely.

    Returns:
        The decoded reply without its line terminator, or ``""`` when the
        exchange fails (the error is logged).
    """
    # A raw line break inside the payload would look like the end of the
    # request under newline framing, so flatten it before sending.
    request = text.replace('\r', ' ').replace('\n', ' ')
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            s.connect((host, port))
            s.sendall(request.encode('utf-8') + b'\n')

            chunks = []
            try:
                while True:
                    chunk = s.recv(4096)
                    if not chunk:
                        # Peer closed the connection: the reply is complete.
                        break
                    chunks.append(chunk)
                    if b'\n' in chunk:
                        # Line terminator seen: the reply is complete.
                        break
            except socket.timeout:
                # Keep a partial reply rather than discarding it; with no
                # data at all this is a genuine failure.
                if not chunks:
                    raise

        # Decode only after all bytes are assembled, then drop the terminator.
        reply = b''.join(chunks).split(b'\n', 1)[0]
        return reply.decode('utf-8').rstrip('\r')
    except Exception as e:
        # Best-effort contract: callers get "" and keep going.
        logging.error(f"Error communicating with TCP server: {e}")
        return ""
def main(max_sentences=1000, report_path="evaluation_report.txt"):
    """Score the TCP translation server on OPUS Books en-it and write a report.

    Loads the ``train`` split of the ``opus_books`` ``en-it`` dataset, sends
    up to ``max_sentences`` English sentences through ``translate_via_tcp``,
    scores the replies against the Italian references with SacreBLEU (BLEU
    and chrF++), logs the scores, and writes them together with the first
    five samples to ``report_path``.

    Args:
        max_sentences: Maximum number of sentence pairs to evaluate.
        report_path: Path of the UTF-8 text report to (over)write.

    Returns:
        None. Returns early, after logging an error, if the dataset cannot
        be loaded or yields no usable sentence pairs.
    """
    logging.info("Starting NMT Accuracy Evaluation...")

    # We will use the opus_books English -> Italian dataset.
    logging.info("Downloading OPUS Books Eng-Ita dataset...")
    try:
        dataset = load_dataset("opus_books", "en-it", split="train")
    except Exception as e:
        logging.error(f"Failed to load dataset: {e}")
        return

    # Keep only rows that carry both language sides.
    pairs = [
        item
        for item in dataset["translation"][:max_sentences]
        if "en" in item and "it" in item
    ]
    english_sentences = [item["en"] for item in pairs]
    reference_italian = [item["it"] for item in pairs]

    total_sentences = len(english_sentences)
    logging.info(f"Loaded {total_sentences} sentences to evaluate.")
    if total_sentences == 0:
        # Nothing to translate or score; avoid writing a meaningless report.
        logging.error("No sentence pairs available. Aborting scoring.")
        return

    # Send each English sentence to the TCP server and collect the Italian
    # output. The server receives the raw text.
    logging.info("Connecting to C++ TCP Engine and processing...")
    predictions = [
        translate_via_tcp(eng)
        for eng in tqdm(english_sentences, desc="Translating")
    ]

    # translate_via_tcp returns "" on failure; surface that instead of
    # letting failed requests silently drag the scores down.
    empty_count = sum(1 for ita in predictions if not ita.strip())
    if empty_count:
        logging.warning(
            f"{empty_count} of {total_sentences} translations came back empty; "
            "scores include them as empty hypotheses."
        )

    logging.info("Calculating BLEU and chrF++ scores via SacreBLEU...")
    # SacreBLEU takes a list of candidate strings, and a list of lists of
    # reference strings (one inner list per reference set).
    refs = [reference_italian]
    bleu = sacrebleu.corpus_bleu(predictions, refs)
    # word_order=2 is what makes this chrF++; the default (0) is plain chrF.
    chrf = sacrebleu.corpus_chrf(predictions, refs, word_order=2)

    logging.info(f"Final BLEU Score: {bleu.score:.2f}")
    logging.info(f"Final chrF++ Score: {chrf.score:.2f}")

    # Write the report: header, metrics, then the first five samples.
    with open(report_path, "w", encoding='utf-8') as f:
        f.write("=== NMT-MenKan English-to-Italian Accuracy Evaluation ===\n")
        f.write("Dataset: OPUS Books (en-it)\n")
        f.write(f"Total Sentences Tested: {total_sentences}\n\n")

        f.write("--- Metrics ---\n")
        f.write(f"BLEU Score: {bleu.score:.2f}\n")
        f.write(f"{bleu.format()}\n\n")
        f.write(f"chrF++ Score: {chrf.score:.2f}\n")
        f.write(f"{chrf.format()}\n\n")

        f.write("--- Sample Outputs (First 5) ---\n")
        samples = zip(english_sentences[:5], predictions[:5], reference_italian[:5])
        for source, predicted, target in samples:
            f.write(f"ENG (Source) : {source}\n")
            f.write(f"ITA (Predict): {predicted}\n")
            f.write(f"ITA (Target) : {target}\n")
            f.write("-" * 40 + "\n")

    logging.info(f"Evaluation report successfully saved to {report_path}")
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()