README.md CHANGED

@@ -10,6 +10,7 @@ pinned: false
 preload_from_hub:
   - "microsoft/phi-2"
   - "BAAI/bge-small-en-v1.5"
+  - "HuggingFaceH4/zephyr-7b-alpha"
+  - "meta-llama/Meta-Llama-3-8B"
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED
@@ -20,12 +20,13 @@ CHEAPMODE = torch.cuda.is_available()
 # LLM = "HuggingFaceH4/zephyr-7b-alpha" if not CHEAPMODE else "microsoft/phi-2"
 
 config = {
-    "LLM": "meta-llama/Meta-Llama-3-8B",
-
+    # "LLM": "meta-llama/Meta-Llama-3-8B",
+    "LLM": "microsoft/phi-2",
+    # "LLM": "HuggingFaceH4/zephyr-7b-alpha",
     "embeddings": "BAAI/bge-small-en-v1.5",
     "similarity_top_k": 2,
     "context_window": 4048,
-    "max_new_tokens":
+    "max_new_tokens": 200,
     "temperature": 0.7,
     "top_k": 5,
     "top_p": 0.95,
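Note: load_RAG_pipeline itself is untouched by this hunk, so for orientation only, here is a minimal sketch, assuming llama_index's HuggingFaceLLM wrapper, of how these config keys would typically reach the model. The helper name build_llm and the device_map choice are assumptions, not code from this Space.

from llama_index.llms.huggingface import HuggingFaceLLM

def build_llm(config):
    # Hypothetical helper: maps the config dict above onto HuggingFaceLLM.
    # "max_new_tokens" now has a value (200), so generation stops after at
    # most 200 tokens; the sampling params travel in generate_kwargs.
    return HuggingFaceLLM(
        model_name=config["LLM"],
        tokenizer_name=config["LLM"],
        context_window=config["context_window"],
        max_new_tokens=config["max_new_tokens"],
        generate_kwargs={
            "temperature": config["temperature"],
            "top_k": config["top_k"],
            "top_p": config["top_p"],
            "do_sample": True,
        },
        device_map="auto",
    )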
@@ -42,17 +43,17 @@ title = "Ask my thesis: Intelligent Automation for AI-Driven Document Understand
 title = center_element(title)
 description = """Chat with the thesis manuscript by asking questions and receive answers with reference to the page.
 
-<div class="
+<div class="center">
 <a href="https://jordy-vl.github.io/assets/phdthesis/VanLandeghem_Jordy_PhD-thesis.pdf">
 <img src="https://ideogram.ai/api/images/direct/cc3Um6ClQkWJpVdXx6pWVA.png"
-title="Thesis.pdf" alt="Ideogram image generated with prompt engineering"/></a>
-</div>
+title="Thesis.pdf" alt="Ideogram image generated with prompt engineering" width="500" class="center"/></a>
+</div> Click the visual above to be redirected to the PDF of the manuscript.
 
 Technology used: [Llama-index](https://www.llamaindex.ai/), OS LLMs from HuggingFace
 
-Spoiler: a RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳
+Spoiler: a quickly hacked together RAG application with a >1B LLM and online vector store can be quite slow on a 290 page document ⏳ (10s+)
 """
-
+
 description = center_element(description)
 
 def messages_to_prompt(messages):
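center_element is called on both title and description but is never shown in the diff; a plausible, purely hypothetical implementation:

def center_element(html: str) -> str:
    # Hypothetical: wrap the HTML/markdown block in a centered div for Gradio.
    return f"<div style='text-align: center;'>{html}</div>"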
@@ -105,6 +106,7 @@ def load_RAG_pipeline(config):
     # Llama-index
     Settings.llm = llm
     Settings.embed_model = HuggingFaceEmbedding(model_name=config["embeddings"])
+    print(Settings)
     Settings.chunk_size = config["chunk_size"]
     Settings.chunk_overlap = config["chunk_overlap"]
 
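The rest of load_RAG_pipeline is outside this hunk. A minimal sketch of how a query engine is typically built once the global Settings above are populated; the "data" directory and the helper name are assumptions:

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

def build_query_engine(config):
    # Assumed layout: the thesis PDF lives in a local "data" directory.
    documents = SimpleDirectoryReader("data").load_data()
    # Embedding model, chunk size, and chunk overlap are picked up from Settings.
    index = VectorStoreIndex.from_documents(documents)
    return index.as_query_engine(similarity_top_k=config["similarity_top_k"])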
@@ -125,23 +127,16 @@ default_query_engine = load_RAG_pipeline(config)
 
 # These are placeholder functions to simulate the behavior of the RAG setup.
 # You would need to implement these with the actual logic to retrieve and generate answers based on the document.
-def get_answer(question,
+def get_answer(question, config, query_engine=default_query_engine):
     # Here you should implement the logic to generate an answer based on the question and the document.
     # For example, you could use a machine learning model for RAG.
     # answer = "This is a placeholder answer."
     # https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/settings/#setting-local-configurations
 
     # if temperature or nucleus sampling or max_tokens != as in config, recall query engine
-    if (
-        temperature != config["temperature"]
-        or nucleus_sampling != config["top_p"]
-        or max_tokens != config["max_new_tokens"]
-    ):
-        config["temperature"] = temperature
-        config["top_p"] = nucleus_sampling
-        config["max_new_tokens"] = max_tokens
-        query_engine = load_RAG_pipeline(config)
+
     response = query_engine.query(question)
+    print(f"A: {response}")
     return response
 
 
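get_answer now returns the raw llama_index Response object (printed for debugging), which is what get_answer_page receives. That helper is also outside the hunk; a hedged sketch of how the answer page could be recovered from source-node metadata, with the "page_label" key being the convention of llama_index's PDF readers:

def page_of(response):
    # Sketch only: each retrieved node carries the metadata of its source
    # document; PDF readers usually record the originating page there.
    if response.source_nodes:
        return response.source_nodes[0].node.metadata.get("page_label")
    return None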
@@ -156,32 +151,87 @@ def get_answer_page(response):
 
 # Create the gr.Interface function
 def ask_my_thesis(
-    question,
+    question,
+    LLM=config["LLM"],
+    embeddings=config["embeddings"],
+    similarity_top_k=config["similarity_top_k"],
+    context_window=config["context_window"],
+    max_new_tokens=config["max_new_tokens"],
+    temperature=config["temperature"],
+    top_k=config["top_k"],
+    top_p=config["top_p"],
+    chunk_size=config["chunk_size"],
+    chunk_overlap=config["chunk_overlap"],
 ):
     print(f"Got Q: {question}")
-
+    query_engine = default_query_engine
+
+    # if any change in kwargs
+    # Check if any of the kwargs have changed
+    if (
+        temperature != config["temperature"]
+        or top_p != config["top_p"]
+        or max_new_tokens != config["max_new_tokens"]
+        or LLM != config["LLM"]
+        or embeddings != config["embeddings"]
+        or similarity_top_k != config["similarity_top_k"]
+        or context_window != config["context_window"]
+        or top_k != config["top_k"]
+        or chunk_size != config["chunk_size"]
+        or chunk_overlap != config["chunk_overlap"]
+    ):
+        # Update the config dictionary with the new values
+        config["temperature"] = temperature
+        config["top_p"] = top_p
+        config["max_new_tokens"] = max_new_tokens
+        # config["LLM"] = LLM
+        # config["embeddings"] = embeddings
+        config["similarity_top_k"] = similarity_top_k
+        config["context_window"] = context_window
+        config["top_k"] = top_k
+        config["chunk_size"] = chunk_size
+        config["chunk_overlap"] = chunk_overlap
+        query_engine = load_RAG_pipeline(config)
+
+    answer = get_answer(question, config, query_engine=query_engine)
     image, answer_page = get_answer_page(answer)
-    return answer, image, answer_page
+    return answer.response, image, answer_page
 
 
 # Set up the interface options based on the design in the image.
 output_image = gr.Image(label="Answer Page")
 
 # examples
-examples = [
+examples = [
+    ["What model is state-of-the-art on DUDE?"],
+    ["Why is knowledge distillation interesting?"],
+    ["What is ANLS?"],
+]
+# Define additional Gradio input elements
+additional_inputs = [
+    # gr.Input("text", label="Question"),
+    # gr.Input("text", label="LLM", value=config["LLM"]),
+    # gr.Input("text", label="Embeddings", value=config["embeddings"]),
+    gr.Slider(1, 5, value=config["similarity_top_k"], label="Similarity Top K"),
+    gr.Slider(512, 8048, value=config["context_window"], label="Context Window"),
+    gr.Slider(20, 250, value=config["max_new_tokens"], label="Max New Tokens"),
+    gr.Slider(0, 1, value=config["temperature"], label="Temperature"),
+    gr.Slider(1, 10, value=config["top_k"], label="Top K"),
+    gr.Slider(0, 1, value=config["top_p"], label="Nucleus Sampling"),
+    gr.Slider(128, 4024, value=config["chunk_size"], label="Chunk Size"),
+    gr.Slider(0, 200, value=config["chunk_overlap"], label="Chunk Overlap"),
+]
 
 iface = gr.Interface(
     fn=ask_my_thesis,
     inputs=[gr.Textbox(label="Question", placeholder="Type your question here...")],
-    additional_inputs=
-        gr.Slider(0, 1, value=0.7, label="Temperature"),
-        gr.Slider(0, 1, value=0.95, label="Nucleus Sampling"),
-        gr.Slider(1, 500, value=150, label="Max Generated Number of Tokens"),
-    ],
+    additional_inputs=additional_inputs,
     outputs=[gr.Textbox(label="Answer"), output_image, gr.Label()],
+    examples=examples,
     title=title,
     description=description,
-    allow_flagging="
+    allow_flagging="auto",
+    cache_examples=True,
 )
 # https://github.com/gradio-app/gradio/issues/4309
 
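The launch call lies beyond the end of the diff; a typical closing for such a Space is sketched below, with queue() being an assumption rather than confirmed code. Worth noting: with cache_examples=True, Gradio runs ask_my_thesis on all three examples at startup to cache their outputs, so a slow RAG pipeline pays that cost once at boot rather than per visitor.

if __name__ == "__main__":
    # queue() serializes long-running requests; launch() starts the app.
    iface.queue().launch()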