johnnydang88 committed on
Commit
33989d0
Β·
verified Β·
1 Parent(s): 9d3ba92

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. 2024ESC-compressed.pdf +3 -0
  3. README.md +20 -8
  4. app.py +227 -0
  5. requirements.txt +11 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ 2024ESC-compressed.pdf filter=lfs diff=lfs merge=lfs -text
2024ESC-compressed.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2162e8eacffe412cad0fcde8ab143f7960c80341319677b118362d2d7783f7c5
3
+ size 2446819
README.md CHANGED
@@ -1,14 +1,26 @@
1
  ---
2
- title: QWEN3
3
- emoji: πŸ‘€
4
- colorFrom: red
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.8.0
8
- python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
- short_description: NLP RAG QWEN3 ESC GUIDELINES 2024
 
 
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Cardiology AI - Llama3
3
+ emoji: 🩺
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: "5.25.0"
 
8
  app_file: app.py
9
  pinned: false
10
+ hardware: zero-a10g
11
+ secrets:
12
+ - HF_TOKEN
13
  ---
14
 
15
+ # 🩺 Cardiology AI Assistant — Llama-3-8B
16
+
17
+ RAG-based cardiology Q&A over the **2024 ESC Guidelines**.
18
+
19
+ - **Retriever:** MedCPT (CPU)
20
+ - **Reranker:** BAAI/bge-reranker-base
21
+ - **Generator:** meta-llama/Meta-Llama-3-8B-Instruct (ZeroGPU)
22
+
23
+ ## Setup
24
+ 1. Upload `2024ESC-compressed.pdf` to the Space repo root.
25
+ 2. Add `HF_TOKEN` in **Settings → Secrets** (Llama3 is a gated model).
26
+ 3. Hardware: ZeroGPU (requires HF Pro).
app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cardiology AI Assistant β€” Meta Llama-3-8B-Instruct
3
+ Hugging Face ZeroGPU Space (free shared A100)
4
+
5
+ ZeroGPU rules applied:
6
+ - No bitsandbytes quantization (can't load 4-bit without CUDA at init time)
7
+ - Model loads to CPU at startup in float16
8
+ - @spaces.GPU decorator borrows GPU only during inference
9
+ - Reranker also moved to GPU only inside @spaces.GPU function
10
+ """
11
+
12
+ import os, gc, time, torch, warnings, pdfplumber
13
+ import spaces # ← ZeroGPU magic
14
+ from typing import List
15
+ from huggingface_hub import login
16
+ from langchain_core.documents import Document
17
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
18
+ from langchain_community.vectorstores import FAISS
19
+ from langchain_core.embeddings import Embeddings
20
+ from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
21
+ from sentence_transformers import CrossEncoder, SentenceTransformer
22
+ import gradio as gr
23
+
24
warnings.filterwarnings("ignore")

# ── Auth ──────────────────────────────────────────────────────────────────────
# HF_TOKEN is read from the Space's secrets; Llama-3 is a gated model, so the
# token is needed to download its weights. Login is skipped when unset so the
# rest of the app (non-gated models) can still start.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Guideline PDF indexed by the RAG pipeline; expected in the repo root.
PDF_PATH = "./2024ESC-compressed.pdf"
33
+ # ══════════════════════════════════════════════════════════════════════════════
34
+ # PDF LOADER
35
+ # ══════════════════════════════════════════════════════════════════════════════
36
def load_pdf_smart(path):
    """Parse a PDF into one LangChain Document per page.

    Each page's plain text is concatenated with any tables found on that
    page, rendered as pipe-delimited rows. The 1-based page number and the
    source filename are stored in each Document's metadata.
    """
    print(f"πŸ“‚ Loading {path}...")

    def render_table(table):
        # One "| a | b |" line per table row; newlines inside a cell are
        # flattened so a single cell never spans multiple output lines.
        rows = [
            "| " + " | ".join(str(cell).replace("\n", " ") if cell else "" for cell in row) + " |"
            for row in table
        ]
        return "\n".join(rows)

    documents = []
    with pdfplumber.open(path) as pdf:
        source_name = os.path.basename(path)
        for page_no, page in enumerate(pdf.pages, start=1):
            body = page.extract_text() or ""
            table_str = ""
            for table in page.extract_tables() or []:
                table_str += "\n" + render_table(table)
            documents.append(
                Document(
                    page_content=f"{body}\n{table_str}",
                    metadata={"page": page_no, "source": source_name},
                )
            )
    return documents
55
+
56
+ # ══════════════════════════════════════════════════════════════════════════════
57
+ # MEDCPT EMBEDDINGS (CPU β€” embeddings don't need GPU)
58
+ # ══════════════════════════════════════════════════════════════════════════════
59
class MedCPTEmbeddings(Embeddings):
    """LangChain Embeddings backed by the NCBI MedCPT dual-encoder pair.

    MedCPT uses separate encoders for queries and articles. The article
    encoder is only needed while building the index and can be released
    afterwards via ``unload_article_encoder()`` to save memory. Everything
    runs on CPU — indexing does not need the (per-request) ZeroGPU device.

    Args:
        load_article_encoder: Load the article encoder for indexing.
        batch_size: Number of texts embedded per forward pass in
            ``embed_documents`` (was a hard-coded 8; now tunable).
    """

    def __init__(self, load_article_encoder=True, batch_size=8):
        self.device = "cpu"  # Keep on CPU; no GPU needed for indexing
        self.batch_size = batch_size
        self.models = {
            "qry_tok": AutoTokenizer.from_pretrained("ncbi/MedCPT-Query-Encoder"),
            "qry_mod": AutoModel.from_pretrained("ncbi/MedCPT-Query-Encoder"),
        }
        if load_article_encoder:
            self.models["art_tok"] = AutoTokenizer.from_pretrained("ncbi/MedCPT-Article-Encoder")
            self.models["art_mod"] = AutoModel.from_pretrained("ncbi/MedCPT-Article-Encoder")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed document chunks with the article encoder (CLS pooling).

        Raises KeyError if the article encoder was not loaded or was
        already unloaded via ``unload_article_encoder()``.
        """
        all_embeddings: List[List[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i: i + self.batch_size]
            inputs = self.models["art_tok"](
                batch, max_length=512, padding=True, truncation=True, return_tensors="pt"
            )
            with torch.no_grad():
                out = self.models["art_mod"](**inputs)
            # The CLS-token (position 0) hidden state is the sentence embedding.
            all_embeddings.extend(out.last_hidden_state[:, 0, :].tolist())
        return all_embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single search query with the query encoder (CLS pooling)."""
        inputs = self.models["qry_tok"](
            [text], max_length=512, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            out = self.models["qry_mod"](**inputs)
        return out.last_hidden_state[:, 0, :][0].tolist()

    def unload_article_encoder(self):
        """Free the article encoder once the vector index has been built."""
        if "art_mod" in self.models:
            del self.models["art_mod"], self.models["art_tok"]
            gc.collect()
94
+
95
+ # ══════════════════════════════════════════════════════════════════════════════
96
+ # STARTUP β€” all loading happens on CPU; no GPU needed here
97
+ # ══════════════════════════════════════════════════════════════════════════════
98
print("πŸ“‚ Loading PDF...")
raw_docs = load_pdf_smart(PDF_PATH)

print("βœ‚οΈ Splitting documents...")
# chunk_size/overlap are in characters; the MedCPT encoders additionally
# truncate at 512 tokens, so oversized chunks are handled either way.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunks = splitter.split_documents(raw_docs)

print("🧠 Building MedCPT vector store (CPU)...")
emb = MedCPTEmbeddings(load_article_encoder=True)
vectorstore = FAISS.from_documents(chunks, emb)
# The article encoder is only needed while indexing; queries use the
# separate query encoder, so release it to reclaim memory.
emb.unload_article_encoder()
print("βœ… Vector store ready.")

# Reranker and metric model stay on CPU at init; reranker is moved to GPU per call
print("βš–οΈ Loading CrossEncoder (CPU init)...")
reranker = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

print("βš™οΈ Loading Llama-3-8B in float16 (CPU)...")
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# Load to CPU in float16 — ZeroGPU will give us an A100 during inference
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
model.eval()
# Llama-3 chat models end a turn with <|eot_id|>; treat it as an extra EOS
# alongside the tokenizer's default so generation stops at turn boundaries.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
print("βœ… Llama-3 ready (CPU). GPU will be borrowed per request via ZeroGPU.")
128
+
129
+ # ══════════════════════════════════════════════════════════════════════════════
130
+ # GPU FUNCTIONS β€” decorated with @spaces.GPU
131
+ # ══════════════════════════════════════════════════════════════════════════════
132
+
133
@spaces.GPU
def rerank_docs(query: str, docs):
    """Score (query, document) pairs with the cross-encoder on GPU.

    The reranker lives on CPU between requests; it is moved to CUDA only
    for the duration of this call (ZeroGPU attaches the device per call).

    Args:
        query: The user question.
        docs: Retrieved documents; each must expose ``page_content``.

    Returns:
        Relevance scores, one per document, in input order.
    """
    reranker.model.to("cuda")
    try:
        scores = reranker.predict([[query, d.page_content] for d in docs])
    finally:
        # Always park the model back on CPU and release GPU memory, even when
        # predict() raises — otherwise the model is stranded on the device.
        reranker.model.to("cpu")
        torch.cuda.empty_cache()
    return scores
141
+
142
@spaces.GPU
def llm_generate(prompt: str) -> str:
    """Run Llama-3 inference on a borrowed ZeroGPU device.

    The model is moved to CUDA only for this call and is returned to CPU in
    a ``finally`` block, so a failed generation cannot leak the 8B model's
    GPU memory into the next request.

    Args:
        prompt: Fully formatted Llama-3 chat prompt (special tokens included).

    Returns:
        The assistant's reply text, with prompt/special tokens stripped.
    """
    model.to("cuda")
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=350,
                temperature=0.1,
                eos_token_id=terminators,
                do_sample=True,
            )
        # Keep only the text after the last "assistant" header in the decoded chat.
        response = tokenizer.decode(output[0], skip_special_tokens=True).split("assistant")[-1].strip()
        del inputs, output
    finally:
        model.to("cpu")
        torch.cuda.empty_cache()
    return response
160
+
161
+ # ══════════════════════════════════════════════════════════════════════════════
162
+ # RAG PIPELINE (streaming status updates, GPU only where needed)
163
+ # ══════════════════════════════════════════════════════════════════════════════
164
def get_answer(query: str):
    """RAG pipeline: retrieve → rerank → generate, streaming status updates.

    Yields interim markdown status messages while each stage runs, then the
    final answer together with the source page numbers of the context used.
    """
    yield "⏳ **Status:** πŸ” Retrieving documents from VectorDB...\n\n---\n"
    candidates = vectorstore.similarity_search(query, k=15)

    yield "⏳ **Status:** πŸ“Š Reranking with CrossEncoder (ZeroGPU)...\n\n---\n"
    scores = rerank_docs(query, candidates)
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    best_docs = [doc for doc, _ in ranked[:5]]

    # Assemble the context block and collect unique page numbers in rank order.
    pages = []
    parts = []
    for doc in best_docs:
        page = str(doc.metadata.get("page", "?"))
        if page not in pages:
            pages.append(page)
        parts.append(f"[Page {page}]\n{doc.page_content}\n\n")
    context = "".join(parts)

    yield "⏳ **Status:** 🧠 Generating with Llama-3 (ZeroGPU A100)...\n\n---\n"
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        "You are a Cardiology Assistant. Answer based ONLY on the context. "
        "Be concise and cite page numbers.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"Context: {context}\nQuestion: {query}"
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )
    answer = llm_generate(prompt)
    yield f"### 🩺 Answer\n\n{answer}\n\nπŸ“„ **Source Pages:** {', '.join(pages)}\n"
191
+
192
+ # ══════════════════════════════════════════════════════════════════════════════
193
+ # GRADIO UI
194
+ # ══════════════════════════════════════════════════════════════════════════════
195
def gradio_wrapper(query):
    """Validate the textbox input, then stream the RAG pipeline's output."""
    cleaned = query.strip() if query else ""
    if not cleaned:
        yield "⚠️ Please enter a valid question."
        return
    yield from get_answer(query)
200
+
201
# Gradio UI: a question box with example prompts; answers stream into a
# markdown pane via the generator pipeline above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🩺 Cardiology AI Assistant (ESC 2024)")
    gr.Markdown("### ⚑ Powered by Meta Llama-3-8B-Instruct · HF ZeroGPU")
    gr.Markdown(
        "Ask questions based on the **2024 ESC Medical Guidelines**. "
        "Uses RAG with MedCPT embeddings, CrossEncoder reranking, and Llama-3-8B generation."
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Your Question",
                placeholder="e.g., What are the class I recommendations for anticoagulation in AF?",
                lines=3,
            )
            submit_btn = gr.Button("Analyze Guidelines", variant="primary")
    # NOTE(review): component nesting below reconstructed from a
    # whitespace-mangled paste — confirm intended layout against the Space.
    output_text = gr.Markdown(label="Assistant Response")
    gr.Examples(
        examples=[
            "What are the class I recommendations for anticoagulation in AF?",
            "Summarize the treatment algorithm for chronic heart failure.",
            "What is the target LDL-C for very high-risk patients?",
        ],
        inputs=input_text,
    )
    # Streams gradio_wrapper's yielded markdown chunks into output_text.
    submit_btn.click(gradio_wrapper, inputs=input_text, outputs=output_text)

# queue() enables streaming/concurrency; 0.0.0.0:7860 is the standard
# Spaces binding.
demo.queue().launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.41.2
2
+ accelerate
3
+ langchain
4
+ langchain-community
5
+ langchain-core
6
+ langchain-text-splitters
7
+ faiss-cpu
8
+ sentence-transformers
9
+ pdfplumber
10
+ torch
11
+ huggingface_hub