Spaces:
Runtime error
Runtime error
Feature(MInference): update information
Browse files
app.py
CHANGED
|
@@ -14,12 +14,15 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
| 14 |
|
| 15 |
|
| 16 |
DESCRIPTION = """
|
| 17 |
-
# MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention
|
| 18 |
_Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
|
| 19 |
|
| 20 |
<h2 style="text-align: center;"><a href="https://github.com/microsoft/MInference" target="blank"> [Code]</a>
|
| 21 |
-
<a href="https://
|
| 22 |
-
<a href="https://arxiv.org/abs/
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
<font color="brown"><b>This is only a deployment demo. Due to limited GPU resources, we do not provide an online demo. You will need to follow the code below to try MInference locally.</b></font>
|
| 25 |
|
|
@@ -55,7 +58,7 @@ h1 {
|
|
| 55 |
"""
|
| 56 |
|
| 57 |
# Load the tokenizer and model
|
| 58 |
-
model_name = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
|
| 59 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 60 |
model = AutoModelForCausalLM.from_pretrained(
|
| 61 |
model_name, torch_dtype="auto", device_map="auto"
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
DESCRIPTION = """
|
| 17 |
+
# [MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention](https://aka.ms/MInference)(Under Review, ES-FoMo @ ICML'24)
|
| 18 |
_Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
|
| 19 |
|
| 20 |
<h2 style="text-align: center;"><a href="https://github.com/microsoft/MInference" target="_blank"> [Code]</a>
|
| 21 |
+
<a href="https://aka.ms/MInference" target="_blank"> [Project Page]</a>
|
| 22 |
+
<a href="https://arxiv.org/abs/2407.02490" target="_blank"> [Paper]</a></h2>
|
| 23 |
+
|
| 24 |
+
## News
|
| 25 |
+
- 🧩 We will present **MInference 1.0** at the _**Microsoft Booth**_ and _**ES-FoMo**_ at ICML'24. See you in Vienna!
|
| 26 |
|
| 27 |
<font color="brown"><b>This is only a deployment demo. Due to limited GPU resources, we do not provide an online demo. You will need to follow the code below to try MInference locally.</b></font>
|
| 28 |
|
|
|
|
| 58 |
"""
|
| 59 |
|
| 60 |
# Load the tokenizer and model
|
| 61 |
+
model_name = "gradientai/Llama-3-8B-Instruct-Gradient-1048k" if torch.cuda.is_available() else "Qwen/Qwen2-0.5B"
|
| 62 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 63 |
model = AutoModelForCausalLM.from_pretrained(
|
| 64 |
model_name, torch_dtype="auto", device_map="auto"
|