Mahmoud Amiri
committed on
Commit
·
beb558a
1
Parent(s):
7a32d56
first commit
Browse files- README.md +10 -6
- app.py +112 -65
- requirements.txt +7 -0
README.md
CHANGED
|
@@ -1,15 +1,19 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.42.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
hf_oauth: true
|
| 11 |
hf_oauth_scopes:
|
| 12 |
-
- inference-api
|
|
|
|
| 13 |
---
|
| 14 |
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Lit2Vec TL;DR Summarizer
|
| 3 |
+
emoji: 🧪
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.42.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
hf_oauth: true
|
| 11 |
hf_oauth_scopes:
|
| 12 |
+
- inference-api
|
| 13 |
+
short_description: TL;DR summarizer for chemistry research abstracts
|
| 14 |
---
|
| 15 |
|
| 16 |
+
|
| 17 |
+
Lit2Vec TL;DR is an abstractive summarization tool for chemistry research abstracts, built using Gradio and a fine-tuned DistilBART model. It generates concise, structured summaries capturing the **methods**, **results**, and **significance** of scientific papers.
|
| 18 |
+
🔬 Model: [`Bocklitz-Lab/lit2vec-tldr-bart-model`](https://huggingface.co/Bocklitz-Lab/lit2vec-tldr-bart-model)
|
| 19 |
+
|
app.py
CHANGED
|
@@ -1,70 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from huggingface_hub import
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
def respond(
|
| 6 |
-
message,
|
| 7 |
-
history: list[dict[str, str]],
|
| 8 |
-
system_message,
|
| 9 |
-
max_tokens,
|
| 10 |
-
temperature,
|
| 11 |
-
top_p,
|
| 12 |
-
hf_token: gr.OAuthToken,
|
| 13 |
-
):
|
| 14 |
-
"""
|
| 15 |
-
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
|
| 16 |
-
"""
|
| 17 |
-
client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
|
| 18 |
-
|
| 19 |
-
messages = [{"role": "system", "content": system_message}]
|
| 20 |
-
|
| 21 |
-
messages.extend(history)
|
| 22 |
-
|
| 23 |
-
messages.append({"role": "user", "content": message})
|
| 24 |
-
|
| 25 |
-
response = ""
|
| 26 |
-
|
| 27 |
-
for message in client.chat_completion(
|
| 28 |
-
messages,
|
| 29 |
-
max_tokens=max_tokens,
|
| 30 |
-
stream=True,
|
| 31 |
-
temperature=temperature,
|
| 32 |
-
top_p=top_p,
|
| 33 |
-
):
|
| 34 |
-
choices = message.choices
|
| 35 |
-
token = ""
|
| 36 |
-
if len(choices) and choices[0].delta.content:
|
| 37 |
-
token = choices[0].delta.content
|
| 38 |
-
|
| 39 |
-
response += token
|
| 40 |
-
yield response
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
"""
|
| 44 |
-
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
|
| 45 |
-
"""
|
| 46 |
-
chatbot = gr.ChatInterface(
|
| 47 |
-
respond,
|
| 48 |
-
type="messages",
|
| 49 |
-
additional_inputs=[
|
| 50 |
-
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
|
| 51 |
-
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
|
| 52 |
-
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
| 53 |
-
gr.Slider(
|
| 54 |
-
minimum=0.1,
|
| 55 |
-
maximum=1.0,
|
| 56 |
-
value=0.95,
|
| 57 |
-
step=0.05,
|
| 58 |
-
label="Top-p (nucleus sampling)",
|
| 59 |
-
),
|
| 60 |
-
],
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
with gr.Blocks() as demo:
|
| 64 |
-
with gr.Sidebar():
|
| 65 |
-
gr.LoginButton()
|
| 66 |
-
chatbot.render()
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
if __name__ == "__main__":
|
|
|
|
| 70 |
demo.launch()
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import numpy as np
|
| 4 |
import gradio as gr
|
| 5 |
+
from huggingface_hub import hf_hub_download
|
| 6 |
+
from tensorflow import keras
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
# -----------------------
|
| 10 |
+
# Config
|
| 11 |
+
# -----------------------
|
| 12 |
+
REPO_ID = "Bocklitz-Lab/lit2vec-subfield-classifier-model"
|
| 13 |
+
EMBED_MODEL = "intfloat/e5-large-v2"
|
| 14 |
+
TEXT_PREFIX = {"abstract": "abstract: ", "summary": "summary: "}
|
| 15 |
+
DEFAULT_THRESHOLD = 0.5
|
| 16 |
+
TOPK_DEFAULT = 5
|
| 17 |
+
|
| 18 |
+
# -----------------------
|
| 19 |
+
# Load model + labels at startup
|
| 20 |
+
# -----------------------
|
| 21 |
+
# Keras model (saved as .h5 on the Hub)
|
| 22 |
+
MODEL_PATH = hf_hub_download(REPO_ID, filename="mlp_model.h5")
|
| 23 |
+
LABEL_MAP_PATH = hf_hub_download(REPO_ID, filename="label_mapping.json")
|
| 24 |
+
|
| 25 |
+
with open(LABEL_MAP_PATH, "r", encoding="utf-8") as f:
|
| 26 |
+
mapping = json.load(f)
|
| 27 |
+
INDEX_TO_LABEL = {int(k): v for k, v in mapping["index_to_label"].items()}
|
| 28 |
+
|
| 29 |
+
# load Keras model for inference
|
| 30 |
+
MODEL = keras.models.load_model(MODEL_PATH, compile=False)
|
| 31 |
+
|
| 32 |
+
# SentenceTransformer encoder (CPU-only for portability)
|
| 33 |
+
ENCODER = SentenceTransformer(EMBED_MODEL, device="cpu")
|
| 34 |
+
|
| 35 |
+
def encode_text(text: str, text_type: str = "abstract") -> np.ndarray:
    """Embed a single text with the shared sentence encoder.

    The prefix registered in ``TEXT_PREFIX`` for ``text_type`` is prepended
    to ``text``; an unknown ``text_type`` adds no prefix. The encoder is
    called with ``normalize_embeddings=True`` on a one-element batch.

    Args:
        text: Raw text to embed.
        text_type: Key into ``TEXT_PREFIX`` selecting the prefix.

    Returns:
        The encoder output for the one-element batch, cast to ``float32``.
    """
    tagged = TEXT_PREFIX.get(text_type, "") + text
    # Encode as a batch of one so the result keeps a leading batch axis.
    vectors = ENCODER.encode([tagged], normalize_embeddings=True)
    return vectors.astype("float32")
|
| 40 |
+
|
| 41 |
+
def predict(text: str, text_type: str, threshold: float, topk: int):
    """Score a text against every subfield and format the results for the UI.

    Args:
        text: Abstract or summary to classify. ``None``, empty, or
            whitespace-only input short-circuits to empty outputs.
        text_type: Prefix selector forwarded to ``encode_text``.
        threshold: Minimum score for a label to be listed as predicted.
        topk: Number of highest-scoring labels to list; coerced to ``int``
            and clamped to at least 1.

    Returns:
        A 3-tuple ``(pred_display, topk_display, table)``:
        ``pred_display`` is a comma-separated string of labels whose score is
        ``>= threshold`` (or ``"—"`` when none qualify); ``topk_display`` is
        one ``"label: score"`` line per top-k label; ``table`` is a list of
        ``[label, score]`` rows for every label, sorted by descending score.
        For blank input the result is ``("", "", [])``.
    """
    text = (text or "").strip()
    if not text:
        return ("", "", [])

    X = encode_text(text, text_type=text_type)
    # The model is fed a batch of one; keep only that row of scores.
    probs = MODEL.predict(X, verbose=0)[0]

    # Labels whose score clears the decision threshold, in index order.
    pred_labels = [INDEX_TO_LABEL[i] for i, p in enumerate(probs) if p >= threshold]
    pred_display = ", ".join(pred_labels) if pred_labels else "—"

    # Sort once (descending) and reuse the ranking for both the top-k list
    # and the full table. Indices are converted to plain ints before they
    # are used as keys into INDEX_TO_LABEL.
    ranked = [int(i) for i in np.argsort(-probs)]

    topk = max(1, int(topk))
    topk_display = "\n".join(
        f"{INDEX_TO_LABEL[i]}: {probs[i]:.3f}" for i in ranked[:topk]
    )

    table = [[INDEX_TO_LABEL[i], float(probs[i])] for i in ranked]

    return pred_display, topk_display, table
|
| 66 |
+
|
| 67 |
+
# -----------------------
|
| 68 |
+
# Gradio UI
|
| 69 |
+
# -----------------------
|
| 70 |
+
with gr.Blocks(fill_height=True) as demo:
|
| 71 |
+
gr.Markdown(
|
| 72 |
+
"""
|
| 73 |
+
# 🔬 Lit2Vec Subfield Classifier
|
| 74 |
+
Enter a **chemistry abstract or summary**. The app encodes it with `e5-large-v2` and predicts one or more **subfields** using the MLP model.
|
| 75 |
+
|
| 76 |
+
**Model:** `Bocklitz-Lab/lit2vec-subfield-classifier-model`
|
| 77 |
+
**Encoder:** `intfloat/e5-large-v2`
|
| 78 |
+
"""
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
with gr.Row():
|
| 82 |
+
text_type = gr.Radio(
|
| 83 |
+
choices=["abstract", "summary"], value="abstract", label="Text type (prefix used for encoding)"
|
| 84 |
+
)
|
| 85 |
+
threshold = gr.Slider(0.0, 1.0, value=DEFAULT_THRESHOLD, step=0.01, label="Decision threshold")
|
| 86 |
+
topk = gr.Slider(1, 10, value=TOPK_DEFAULT, step=1, label="Top-K to display")
|
| 87 |
+
|
| 88 |
+
input_box = gr.Textbox(
|
| 89 |
+
label="Paste abstract / summary",
|
| 90 |
+
placeholder="Paste your chemistry abstract here…",
|
| 91 |
+
lines=12,
|
| 92 |
+
value="We investigate Pt-based nanoparticle catalysts for hydrogen evolution, "
|
| 93 |
+
"demonstrating improved activity via ligand tuning and support engineering."
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
run_btn = gr.Button("Predict subfield(s)")
|
| 97 |
+
|
| 98 |
+
with gr.Row():
|
| 99 |
+
selected_labels = gr.Textbox(label="Predicted fields (thresholded)", lines=2)
|
| 100 |
+
topk_labels = gr.Textbox(label="Top-K (scores)", lines=6)
|
| 101 |
+
|
| 102 |
+
scores_table = gr.Dataframe(
|
| 103 |
+
headers=["Subfield", "Score"],
|
| 104 |
+
datatype=["str", "number"],
|
| 105 |
+
label="All scores (sorted)",
|
| 106 |
+
interactive=False
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
run_btn.click(
|
| 110 |
+
fn=predict,
|
| 111 |
+
inputs=[input_box, text_type, threshold, topk],
|
| 112 |
+
outputs=[selected_labels, topk_labels, scores_table]
|
| 113 |
+
)
|
| 114 |
|
| 115 |
if __name__ == "__main__":
|
| 116 |
+
# On Spaces, Gradio sets host/port; keep defaults.
|
| 117 |
demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.42.0
|
| 2 |
+
transformers==4.55.3
|
| 3 |
+
torch==2.8.0
|
| 4 |
+
sentence-transformers==2.7.0
|
| 5 |
+
huggingface_hub==0.34.4  # transformers 4.55.x needs huggingface-hub>=0.34.0
|
| 6 |
+
tensorflow-cpu==2.15.0
|
| 7 |
+
numpy==1.26
|