johnnydang88 committed on
Commit
33989d0
Β·
verified Β·
1 Parent(s): 9d3ba92

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. 2024ESC-compressed.pdf +3 -0
  3. README.md +20 -8
  4. app.py +227 -0
  5. requirements.txt +11 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ 2024ESC-compressed.pdf filter=lfs diff=lfs merge=lfs -text
2024ESC-compressed.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2162e8eacffe412cad0fcde8ab143f7960c80341319677b118362d2d7783f7c5
3
+ size 2446819
README.md CHANGED
@@ -1,14 +1,26 @@
1
  ---
2
- title: QWEN3
3
- emoji: πŸ‘€
4
- colorFrom: red
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 6.8.0
8
- python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
- short_description: NLP RAG QWEN3 ESC GUIDELINES 2024
 
 
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Cardiology AI - Llama3
3
+ emoji: 🩺
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: "5.25.0"
 
8
  app_file: app.py
9
  pinned: false
10
+ hardware: zero-a10g
11
+ secrets:
12
+ - HF_TOKEN
13
  ---
14
 
15
+ # 🩺 Cardiology AI Assistant — Llama-3-8B
16
+
17
+ RAG-based cardiology Q&A over the **2024 ESC Guidelines**.
18
+
19
+ - **Retriever:** MedCPT (CPU)
20
+ - **Reranker:** BAAI/bge-reranker-base
21
+ - **Generator:** meta-llama/Meta-Llama-3-8B-Instruct (ZeroGPU)
22
+
23
+ ## Setup
24
+ 1. Upload `2024ESC-compressed.pdf` to the Space repo root.
25
+ 2. Add `HF_TOKEN` in **Settings → Secrets** (Llama3 is a gated model).
26
+ 3. Hardware: ZeroGPU (requires HF Pro).
app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cardiology AI Assistant β€” Meta Llama-3-8B-Instruct
3
+ Hugging Face ZeroGPU Space (free shared A100)
4
+
5
+ ZeroGPU rules applied:
6
+ - No bitsandbytes quantization (can't load 4-bit without CUDA at init time)
7
+ - Model loads to CPU at startup in float16
8
+ - @spaces.GPU decorator borrows GPU only during inference
9
+ - Reranker also moved to GPU only inside @spaces.GPU function
10
+ """
11
+
12
+ import os, gc, time, torch, warnings, pdfplumber
13
+ import spaces # ← ZeroGPU magic
14
+ from typing import List
15
+ from huggingface_hub import login
16
+ from langchain_core.documents import Document
17
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
18
+ from langchain_community.vectorstores import FAISS
19
+ from langchain_core.embeddings import Embeddings
20
+ from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
21
+ from sentence_transformers import CrossEncoder, SentenceTransformer
22
+ import gradio as gr
23
+
24
warnings.filterwarnings("ignore")

# ── Auth ──────────────────────────────────────────────────────────────────────
# HF_TOKEN is read from the Space's secrets; Llama-3 is a gated model, so the
# token is needed to download its weights. Login is skipped when unset so the
# rest of the app (non-gated models) can still start.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Guideline PDF indexed by the RAG pipeline; expected in the repo root.
PDF_PATH = "./2024ESC-compressed.pdf"
33
+ # ══════════════════════════════════════════════════════════════════════════════
34
+ # PDF LOADER
35
+ # ══════════════════════════════════════════════════════════════════════════════
36
def load_pdf_smart(path):
    """Parse a PDF into one LangChain Document per page.

    Each page's plain text is concatenated with any tables found on that
    page, rendered as pipe-delimited rows. The 1-based page number and the
    source filename are stored in each Document's metadata.
    """
    print(f"πŸ“‚ Loading {path}...")

    def render_table(table):
        # One "| a | b |" line per table row; newlines inside a cell are
        # flattened so a single cell never spans multiple output lines.
        rows = [
            "| " + " | ".join(str(cell).replace("\n", " ") if cell else "" for cell in row) + " |"
            for row in table
        ]
        return "\n".join(rows)

    documents = []
    with pdfplumber.open(path) as pdf:
        source_name = os.path.basename(path)
        for page_no, page in enumerate(pdf.pages, start=1):
            body = page.extract_text() or ""
            table_str = ""
            for table in page.extract_tables() or []:
                table_str += "\n" + render_table(table)
            documents.append(
                Document(
                    page_content=f"{body}\n{table_str}",
                    metadata={"page": page_no, "source": source_name},
                )
            )
    return documents
55
+
56
+ # ══════════════════════════════════════════════════════════════════════════════
57
+ # MEDCPT EMBEDDINGS (CPU β€” embeddings don't need GPU)
58
+ # ══════════════════════════════════════════════════════════════════════════════
59
class MedCPTEmbeddings(Embeddings):
    """LangChain Embeddings backed by the NCBI MedCPT dual-encoder pair.

    MedCPT uses separate encoders for queries and articles. The article
    encoder is only needed while building the index and can be released
    afterwards via ``unload_article_encoder()`` to save memory. Everything
    runs on CPU — indexing does not need the (per-request) ZeroGPU device.

    Args:
        load_article_encoder: Load the article encoder for indexing.
        batch_size: Number of texts embedded per forward pass in
            ``embed_documents`` (was a hard-coded 8; now tunable).
    """

    def __init__(self, load_article_encoder=True, batch_size=8):
        self.device = "cpu"  # Keep on CPU; no GPU needed for indexing
        self.batch_size = batch_size
        self.models = {
            "qry_tok": AutoTokenizer.from_pretrained("ncbi/MedCPT-Query-Encoder"),
            "qry_mod": AutoModel.from_pretrained("ncbi/MedCPT-Query-Encoder"),
        }
        if load_article_encoder:
            self.models["art_tok"] = AutoTokenizer.from_pretrained("ncbi/MedCPT-Article-Encoder")
            self.models["art_mod"] = AutoModel.from_pretrained("ncbi/MedCPT-Article-Encoder")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed document chunks with the article encoder (CLS pooling).

        Raises KeyError if the article encoder was not loaded or was
        already unloaded via ``unload_article_encoder()``.
        """
        all_embeddings: List[List[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i: i + self.batch_size]
            inputs = self.models["art_tok"](
                batch, max_length=512, padding=True, truncation=True, return_tensors="pt"
            )
            with torch.no_grad():
                out = self.models["art_mod"](**inputs)
            # The CLS-token (position 0) hidden state is the sentence embedding.
            all_embeddings.extend(out.last_hidden_state[:, 0, :].tolist())
        return all_embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single search query with the query encoder (CLS pooling)."""
        inputs = self.models["qry_tok"](
            [text], max_length=512, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            out = self.models["qry_mod"](**inputs)
        return out.last_hidden_state[:, 0, :][0].tolist()

    def unload_article_encoder(self):
        """Free the article encoder once the vector index has been built."""
        if "art_mod" in self.models:
            del self.models["art_mod"], self.models["art_tok"]
            gc.collect()
94
+
95
+ # ══════════════════════════════════════════════════════════════════════════════
96
+ # STARTUP β€” all loading happens on CPU; no GPU needed here
97
+ # ══════════════════════════════════════════════════════════════════════════════
98
print("πŸ“‚ Loading PDF...")
raw_docs = load_pdf_smart(PDF_PATH)

print("βœ‚οΈ Splitting documents...")
# chunk_size/overlap are in characters; the MedCPT encoders additionally
# truncate at 512 tokens, so oversized chunks are handled either way.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunks = splitter.split_documents(raw_docs)

print("🧠 Building MedCPT vector store (CPU)...")
emb = MedCPTEmbeddings(load_article_encoder=True)
vectorstore = FAISS.from_documents(chunks, emb)
# The article encoder is only needed while indexing; queries use the
# separate query encoder, so release it to reclaim memory.
emb.unload_article_encoder()
print("βœ… Vector store ready.")

# Reranker and metric model stay on CPU at init; reranker is moved to GPU per call
print("βš–οΈ Loading CrossEncoder (CPU init)...")
reranker = CrossEncoder("BAAI/bge-reranker-base", device="cpu")

print("βš™οΈ Loading Llama-3-8B in float16 (CPU)...")
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# Load to CPU in float16 — ZeroGPU will give us an A100 during inference
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
model.eval()
# Llama-3 chat models end a turn with <|eot_id|>; treat it as an extra EOS
# alongside the tokenizer's default so generation stops at turn boundaries.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
print("βœ… Llama-3 ready (CPU). GPU will be borrowed per request via ZeroGPU.")
128
+
129
+ # ══════════════════════════════════════════════════════════════════════════════
130
+ # GPU FUNCTIONS β€” decorated with @spaces.GPU
131
+ # ══════════════════════════════════════════════════════════════════════════════
132
+
133
@spaces.GPU
def rerank_docs(query: str, docs):
    """Score (query, document) pairs with the cross-encoder on GPU.

    The reranker lives on CPU between requests; it is moved to CUDA only
    for the duration of this call (ZeroGPU attaches the device per call).

    Args:
        query: The user question.
        docs: Retrieved documents; each must expose ``page_content``.

    Returns:
        Relevance scores, one per document, in input order.
    """
    reranker.model.to("cuda")
    try:
        scores = reranker.predict([[query, d.page_content] for d in docs])
    finally:
        # Always park the model back on CPU and release GPU memory, even when
        # predict() raises — otherwise the model is stranded on the device.
        reranker.model.to("cpu")
        torch.cuda.empty_cache()
    return scores
141
+
142
@spaces.GPU
def llm_generate(prompt: str) -> str:
    """Run Llama-3 inference on a borrowed ZeroGPU device.

    The model is moved to CUDA only for this call and is returned to CPU in
    a ``finally`` block, so a failed generation cannot leak the 8B model's
    GPU memory into the next request.

    Args:
        prompt: Fully formatted Llama-3 chat prompt (special tokens included).

    Returns:
        The assistant's reply text, with prompt/special tokens stripped.
    """
    model.to("cuda")
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=350,
                temperature=0.1,
                eos_token_id=terminators,
                do_sample=True,
            )
        # Keep only the text after the last "assistant" header in the decoded chat.
        response = tokenizer.decode(output[0], skip_special_tokens=True).split("assistant")[-1].strip()
        del inputs, output
    finally:
        model.to("cpu")
        torch.cuda.empty_cache()
    return response
160
+
161
+ # ══════════════════════════════════════════════════════════════════════════════
162
+ # RAG PIPELINE (streaming status updates, GPU only where needed)
163
+ # ══════════════════════════════════════════════════════════════════════════════
164
def get_answer(query: str):
    """RAG pipeline: retrieve → rerank → generate, streaming status updates.

    Yields interim markdown status messages while each stage runs, then the
    final answer together with the source page numbers of the context used.
    """
    yield "⏳ **Status:** πŸ” Retrieving documents from VectorDB...\n\n---\n"
    candidates = vectorstore.similarity_search(query, k=15)

    yield "⏳ **Status:** πŸ“Š Reranking with CrossEncoder (ZeroGPU)...\n\n---\n"
    scores = rerank_docs(query, candidates)
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    best_docs = [doc for doc, _ in ranked[:5]]

    # Assemble the context block and collect unique page numbers in rank order.
    pages = []
    parts = []
    for doc in best_docs:
        page = str(doc.metadata.get("page", "?"))
        if page not in pages:
            pages.append(page)
        parts.append(f"[Page {page}]\n{doc.page_content}\n\n")
    context = "".join(parts)

    yield "⏳ **Status:** 🧠 Generating with Llama-3 (ZeroGPU A100)...\n\n---\n"
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
        "You are a Cardiology Assistant. Answer based ONLY on the context. "
        "Be concise and cite page numbers.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"Context: {context}\nQuestion: {query}"
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )
    answer = llm_generate(prompt)
    yield f"### 🩺 Answer\n\n{answer}\n\nπŸ“„ **Source Pages:** {', '.join(pages)}\n"
191
+
192
+ # ══════════════════════════════════════════════════════════════════════════════
193
+ # GRADIO UI
194
+ # ══════════════════════════════════════════════════════════════════════════════
195
def gradio_wrapper(query):
    """Validate the textbox input, then stream the RAG pipeline's output."""
    cleaned = query.strip() if query else ""
    if not cleaned:
        yield "⚠️ Please enter a valid question."
        return
    yield from get_answer(query)
200
+
201
# Gradio UI: a question box with example prompts; answers stream into a
# markdown pane via the generator pipeline above.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🩺 Cardiology AI Assistant (ESC 2024)")
    gr.Markdown("### ⚑ Powered by Meta Llama-3-8B-Instruct · HF ZeroGPU")
    gr.Markdown(
        "Ask questions based on the **2024 ESC Medical Guidelines**. "
        "Uses RAG with MedCPT embeddings, CrossEncoder reranking, and Llama-3-8B generation."
    )
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Your Question",
                placeholder="e.g., What are the class I recommendations for anticoagulation in AF?",
                lines=3,
            )
            submit_btn = gr.Button("Analyze Guidelines", variant="primary")
    # NOTE(review): component nesting below reconstructed from a
    # whitespace-mangled paste — confirm intended layout against the Space.
    output_text = gr.Markdown(label="Assistant Response")
    gr.Examples(
        examples=[
            "What are the class I recommendations for anticoagulation in AF?",
            "Summarize the treatment algorithm for chronic heart failure.",
            "What is the target LDL-C for very high-risk patients?",
        ],
        inputs=input_text,
    )
    # Streams gradio_wrapper's yielded markdown chunks into output_text.
    submit_btn.click(gradio_wrapper, inputs=input_text, outputs=output_text)

# queue() enables streaming/concurrency; 0.0.0.0:7860 is the standard
# Spaces binding.
demo.queue().launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.41.2
2
+ accelerate
3
+ langchain
4
+ langchain-community
5
+ langchain-core
6
+ langchain-text-splitters
7
+ faiss-cpu
8
+ sentence-transformers
9
+ pdfplumber
10
+ torch
11
+ huggingface_hub