"""FastAPI service exposing an extractive summarizer for medical reports.

Each sentence of the submitted text is scored by a SciBERT encoder with a
single-logit classification head; the highest-scoring sentences are returned
in their original order.
"""

import os
import threading
import time

import spacy
import spacy.cli
import torch
import torch.nn as nn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModel, AutoTokenizer

app = FastAPI(
    title="Clinical Extractive Summarization",
    description="SciBERT + BERTsum Fine-Tuned Engine for Medical Reports",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Checkpoint / tokenisation settings used by the lazy loader and the scorer.
_MODEL_NAME = "allenai/scibert_scivocab_uncased"
_MODEL_PATH = "med_summarizer_trained.pt"
_SPACY_MODEL = "en_core_web_sm"
_MAX_TOKENS = 128
# Sentences whose stripped length is not above this are dropped before scoring.
_MIN_SENTENCE_CHARS = 10


# --- ARCHITECTURE DEFINITION ---
class BioExtractor(nn.Module):
    """Transformer encoder followed by a one-unit sigmoid classification head."""

    def __init__(self, model_name):
        """Load the pretrained encoder named ``model_name`` and build the head."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder config; fall back to the previously
        # hard-coded 768 when the config does not expose ``hidden_size``.
        hidden_size = getattr(self.bert.config, "hidden_size", 768)
        # The classification layer that predicts sentence salience
        self.classifier = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the sigmoid score computed from the first token's hidden state.

        The result has one score per row of the batch (shape ``(batch, 1)``).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.sigmoid(self.classifier(cls_output))


# Global variables to cache models in memory
tokenizer = None
model = None
nlp = None
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Serialises the one-time lazy initialisation of the globals above.
_load_lock = threading.Lock()


class ReportRequest(BaseModel):
    """Request body: the report text and how many sentences to keep."""

    text: str
    num_sentences: int = 3


def _ensure_models_loaded():
    """Populate the ``tokenizer``, ``model`` and ``nlp`` globals exactly once.

    Everything is built into local variables and published only after every
    step has succeeded, with ``model`` assigned last. A concurrent request
    therefore never observes a half-initialised state, and a failed load
    leaves the globals untouched so the next request retries.
    """
    global tokenizer, model, nlp

    if model is not None:
        return

    with _load_lock:
        # Another thread may have finished loading while we waited.
        if model is not None:
            return

        print("Initializing Fine-Tuned SciBERT and SpaCy...")

        # Load the base tokenizer
        loaded_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)

        # Instantiate the custom architecture
        loaded_model = BioExtractor(_MODEL_NAME)

        # Load the trained weights from the uploaded .pt file
        if os.path.exists(_MODEL_PATH):
            print(f"Loading fine-tuned weights from {_MODEL_PATH}...")
            # map_location lets a checkpoint saved on GPU load on a CPU host.
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints from a trusted source (consider weights_only=True
            # where the installed torch version supports it).
            state_dict = torch.load(_MODEL_PATH, map_location=device)
            loaded_model.load_state_dict(state_dict)
        else:
            print(f"WARNING: {_MODEL_PATH} not found! Upload it to your Space.")

        loaded_model.to(device)
        loaded_model.eval()  # Lock the model for inference

        try:
            loaded_nlp = spacy.load(_SPACY_MODEL)
        except OSError:
            print("Downloading SpaCy English model...")
            spacy.cli.download(_SPACY_MODEL)
            loaded_nlp = spacy.load(_SPACY_MODEL)

        print("Models loaded successfully!")

        tokenizer = loaded_tokenizer
        nlp = loaded_nlp
        # Published last: ``model is not None`` is the "ready" signal.
        model = loaded_model


def _score_sentences(sentences):
    """Return one salience score (float) per sentence, in input order."""
    scores = []
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(
                sentence,
                return_tensors="pt",
                truncation=True,
                padding='max_length',
                max_length=_MAX_TOKENS,
            ).to(device)
            output = model(inputs['input_ids'], inputs['attention_mask'])
            scores.append(output.item())
    return scores


@app.get("/")
def health_check():
    """Report that the service is up and point callers at the real endpoints."""
    return {
        "status": "Engine is running",
        "message": "Send POST requests to /api/summarize",
        "docs": "Visit /docs for the Swagger UI",
    }


@app.post("/api/summarize")
def summarize_medical_report(request: ReportRequest):
    """Return the ``num_sentences`` highest-scoring sentences of the report.

    The models are loaded lazily on the first call, so that call's reported
    processing time includes the load. When the report has no more usable
    sentences than requested, the original text is returned unchanged with a
    ``too_short`` status.

    Raises:
        HTTPException: 422 when ``num_sentences`` is smaller than 1.
    """
    start_time = time.perf_counter()

    # Zero would yield an empty summary and a negative value would slice from
    # the end of the ranking, so reject both explicitly.
    if request.num_sentences < 1:
        raise HTTPException(
            status_code=422,
            detail="num_sentences must be a positive integer",
        )

    _ensure_models_loaded()

    # 1. Split text into sentences using SpaCy
    doc = nlp(request.text)

    # Filter out extremely short strings
    stripped = (sent.text.strip() for sent in doc.sents)
    sentences = [s for s in stripped if len(s) > _MIN_SENTENCE_CHARS]

    # Edge case: Report is too short to summarize
    if len(sentences) <= request.num_sentences:
        return {"summary": request.text, "metadata": {"status": "too_short"}}

    # 2. Get probability scores for each sentence using the fine-tuned model
    scores = _score_sentences(sentences)

    # 3. Rank sentence indices by score, highest first. The sort is stable,
    #    so equal scores keep their original relative order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # 4. Keep the top N and restore document order
    top_indices_sorted = sorted(ranked[:request.num_sentences])
    final_summary = " ".join(sentences[i] for i in top_indices_sorted)

    process_time = round((time.perf_counter() - start_time) * 1000, 2)

    return {
        "summary": final_summary,
        "metadata": {
            "processing_time_ms": process_time,
            "original_length": len(sentences),
            "summary_length": len(top_indices_sorted),
            "engine": "SciBERT + BERTsum Fine-Tuned",
        },
    }