moseleydev committed on
Commit
3142c97
·
verified ·
1 Parent(s): a1d4d76

Pushed to Hugging Face

Browse files
Files changed (3) hide show
  1. Dockerfile +20 -0
  2. main.py +93 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the final image small.
FROM python:3.10-slim

# Run as a non-root user with uid 1000 -- matches Hugging Face Spaces
# conventions (this commit targets a Space; see commit message).
RUN useradd -m -u 1000 user

USER user

# Put user-local pip installs (~/.local/bin) on PATH so uvicorn resolves.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy requirements first so the dependency layer is cached across
# source-only rebuilds.
COPY --chown=user requirements.txt .

RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application source, owned by the non-root user.
COPY --chown=user . .

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from transformers import AutoTokenizer, AutoModel
5
+ from sklearn.cluster import KMeans
6
+ import torch
7
+ import numpy as np
8
+ import spacy
9
+ import time
10
+
11
# FastAPI application exposing the extractive-summarization endpoint.
app = FastAPI(
    title="Clinical Extractive Summarization",
    description="SciBERT + KMeans NLP Engine for Medical Reports"
)

# Allow any origin/method/header so arbitrary front-ends can call the API.
# NOTE(review): wide-open CORS -- consider restricting origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model handles are populated lazily inside the /api/summarize handler on
# the first request, so the process starts without downloading weights.
tokenizer = None  # SciBERT tokenizer (transformers AutoTokenizer)
model = None      # SciBERT encoder (transformers AutoModel)
nlp = None        # spaCy pipeline used for sentence splitting
26
+
27
class ReportRequest(BaseModel):
    """Request body for POST /api/summarize."""
    # Raw clinical report text to summarize.
    text: str
    # Number of sentences to keep in the summary (defaults to 3).
    num_sentences: int = 3
30
+
31
@app.post("/api/summarize")
def summarize_medical_report(request: ReportRequest):
    """Return an extractive summary of a clinical report.

    Embeds each sentence with SciBERT's [CLS] vector, clusters the
    embeddings with KMeans (one cluster per requested summary sentence),
    and keeps the sentence nearest each centroid, re-sorted into original
    document order.

    Raises:
        HTTPException: 400 if ``text`` is empty/blank or
            ``num_sentences`` is less than 1.
    """
    start_time = time.time()

    # Validate input up front. Previously an empty text or a
    # num_sentences < 1 reached KMeans and surfaced as an opaque 500;
    # HTTPException was imported but never used.
    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Field 'text' must be a non-empty string.")
    if request.num_sentences < 1:
        raise HTTPException(status_code=400, detail="Field 'num_sentences' must be >= 1.")

    # Lazy model load on first request so the container starts quickly.
    # NOTE(review): not guarded by a lock -- two concurrent first requests
    # may both load the models (wasteful but harmless).
    global tokenizer, model, nlp
    if model is None:
        print("Initializing SciBERT and SpaCy... This takes a moment.")
        # Load SciBERT
        model_name = "allenai/scibert_scivocab_uncased"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        try:
            nlp = spacy.load("en_core_web_sm")
        except OSError:
            # Model data not baked into the image: download it once.
            import spacy.cli
            spacy.cli.download("en_core_web_sm")
            nlp = spacy.load("en_core_web_sm")
        print("Models loaded successfully!")

    # 1. Safely split text into sentences using SpaCy NLP; drop fragments
    #    of 5 characters or fewer.
    doc = nlp(request.text)
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 5]

    # Edge case: report is already no longer than the requested summary.
    if len(sentences) <= request.num_sentences:
        return {"summary": request.text, "metadata": {"status": "too_short"}}

    # 2. Get a SciBERT [CLS]-token embedding for each sentence.
    embeddings = []
    for sent in sentences:
        inputs = tokenizer(sent, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            output = model(**inputs)
        embeddings.append(output.last_hidden_state[0][0].numpy())
    # Stack once into (n_sentences, hidden_dim); the original kept a list
    # that NumPy re-converted on every centroid-distance computation.
    embeddings = np.vstack(embeddings)

    # 3. Cluster embeddings; each centroid stands for one "topic".
    #    n_init='auto' suppresses sklearn warnings.
    kmeans = KMeans(n_clusters=request.num_sentences, n_init='auto', random_state=42).fit(embeddings)

    # Pick the sentence closest to each centroid. A set, because two
    # centroids can share a nearest sentence.
    chosen = {
        int(np.argmin(np.linalg.norm(embeddings - center, axis=1)))
        for center in kmeans.cluster_centers_
    }

    # 4. Sort indices chronologically to maintain report flow.
    ordered = sorted(chosen)
    final_summary = " ".join(sentences[i] for i in ordered)

    process_time = round((time.time() - start_time) * 1000, 2)

    return {
        "summary": final_summary,
        "metadata": {
            "processing_time_ms": process_time,
            "original_length": len(sentences),
            "summary_length": len(ordered),
            "engine": "SciBERT + KMeans"
        }
    }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Web framework, ASGI server, and request-model validation
fastapi
uvicorn
pydantic
# NLP stack: SciBERT embeddings (transformers + torch), KMeans clustering
# (scikit-learn), and sentence splitting (spacy; en_core_web_sm is
# downloaded at runtime by main.py if missing)
transformers
torch
scikit-learn
spacy