Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import hnswlib
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 5 |
+
import os
|
| 6 |
+
from together import Together
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from cryptography.fernet import Fernet
|
| 9 |
+
import gzip
|
| 10 |
+
import io
|
| 11 |
+
|
| 12 |
+
# Turn off the tokenizers library's own parallelism before any model is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load API_KEY / KEY2 from a local .env file when one is present.
load_dotenv()

# Client for the hosted LLM that writes the answers.
client = Together(api_key=os.environ.get("API_KEY"))

# Read data: corpus.gz is gzip-compressed, Fernet-encrypted parquet bytes.
# Index os.environ directly so a missing KEY2 fails with KeyError('KEY2')
# rather than "'NoneType' object has no attribute 'encode'".
fernet = Fernet(os.environ["KEY2"].encode("utf-8"))

with gzip.open("corpus.gz", "rb") as f:
    bytes_enc = f.read()

pq_bytes = fernet.decrypt(bytes_enc)
pq_file = io.BytesIO(pq_bytes)
corpus = pd.read_parquet(pq_file)

# Bi-encoder embeds queries for the nearest-neighbour lookup.
biencoder = SentenceTransformer("intfloat/multilingual-e5-small", device="cpu")
embedding_size = biencoder.get_sentence_embedding_dimension()

# Cross-encoder rescores (query, passage) pairs after retrieval.
crossencoder = CrossEncoder("KennethTM/MiniLM-L6-danish-reranker", device="cpu")

# Prebuilt HNSW index; its dimension must match the bi-encoder output.
index = hnswlib.Index(space="cosine", dim=embedding_size)
index.load_index("corpus.index")
index.set_ef(40)

# Hand-over between search() (writes "context" and "query") and
# search_summary() (reads them).
# NOTE(review): this dict is module-level, so every session of the app shares
# it; concurrent users can overwrite each other's entries.
state = {}
|
| 37 |
+
|
| 38 |
+
# Display names for the short source keys stored in the corpus.
source_label = {"wiki": "Wikipedia", "lex": "lex.dk", "mfkn": "MFKN", "dce": "DCE"}


def format_markdown(results):
    """Render search hits as a numbered Markdown reference list.

    Args:
        results: Mapping or DataFrame exposing the columns "title",
            "source", "url" and "text_chunks"; the columns are walked in
            lockstep, one entry per hit.

    Returns:
        A Markdown string that starts with the "## Referencer:" heading and
        is followed by one "### N. [title](url) (source)" entry per hit,
        numbered from 1, each with the quoted passage on the next line.
    """
    result_template = '### {idx}. [{title}]({url}) ({source}):\n"{text}"'
    columns = zip(
        results["title"],
        results["source"],
        results["url"],
        results["text_chunks"],
    )
    entries = [
        result_template.format(
            idx=idx,
            title=title,
            url=url,
            # Fall back to the raw key so an unmapped source is still
            # rendered instead of raising KeyError.
            source=source_label.get(source, source),
            text=text,
        )
        for idx, (title, source, url, text) in enumerate(columns, start=1)
    ]
    result_join = "\n\n".join(entries)

    return f"## Referencer:\n\n{result_join}"
|
| 47 |
+
|
| 48 |
+
def format_context(results):
    """Build the plain-text context handed to the language model.

    Args:
        results: Mapping or DataFrame exposing a "text_chunks" column.

    Returns:
        The passages joined by blank lines, each prefixed with a
        "Kilde N:" line numbered from 1. Empty input gives an empty string.
    """
    return "\n\n".join(
        f"Kilde {idx}:\n{text}"
        for idx, text in enumerate(results["text_chunks"], start=1)
    )
|
| 53 |
+
|
| 54 |
+
def search(query, top_k):
    """Retrieve, rerank and format the best passages for a query.

    The query is embedded with the bi-encoder, twice as many candidates as
    requested are fetched from the HNSW index, the cross-encoder rescores
    them, and the ``top_k`` highest-scoring rows are kept.

    Side effect: stores the LLM context and the query in the module-level
    ``state`` dict under "context" and "query" for ``search_summary``.

    Args:
        query: Question text entered by the user.
        top_k: Number of references to return; converted with ``int``
            (it presumably arrives from the UI number field and may be a
            float).

    Returns:
        Markdown reference list produced by ``format_markdown``.
    """
    top_k = int(top_k)

    query_embedding = biencoder.encode(query, prompt="query: ")

    # Over-fetch so the reranker has more candidates than are finally shown.
    biencoder_hits = top_k * 2
    ids, _ = index.knn_query(query_embedding, k=biencoder_hits)
    ids = ids[0]  # single query -> first (only) row of labels

    # Index labels are used as positional row numbers into the corpus.
    results = corpus.iloc[ids].copy()
    results["scores"] = crossencoder.predict(
        [(query, chunk) for chunk in results["text_chunks"]]
    )
    results = results.sort_values("scores", ascending=False)
    results = results[:top_k]

    results_markdown = format_markdown(results)
    results_context = format_context(results)

    state["context"] = results_context
    state["query"] = query

    return results_markdown
|
| 74 |
+
|
| 75 |
+
def search_summary():
    """Stream an LLM-written answer for the most recent search.

    Reads "context" and "query" from the module-level ``state`` dict and
    sends them to the chat-completions endpoint with streaming enabled.

    Yields:
        The answer text accumulated so far, once per received chunk, so the
        caller can keep replacing the displayed text. Yields a single empty
        string when no search has stored a context and query yet.
    """
    context = state.get("context")
    query = state.get("query")

    if context is None or query is None:
        # Nothing to summarise (no search has stored its results); show an
        # empty answer instead of failing with KeyError.
        yield ""
        return

    prompt = [
        {"role": "system", "content": "Svar på spørgsmålet. Du er ekspert i spørgsmål indenfor natur og miljø. Anvend kilderne i konteksten hvis de kan bruges til besvarelsen. Besvar kun på dansk."},
        {"role": "user", "content": f"Kontekst:\n{context}\n\nSpørgsmål:\n{query}"},
    ]

    stream = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=prompt,
        stream=True,
        max_tokens=1024,
    )

    partial_message = ""
    for chunk in stream:
        if not chunk.choices:
            # Skip chunks without choices instead of raising IndexError.
            continue
        # delta.content can be empty/None; treat that as no new text.
        partial_message += chunk.choices[0].delta.content or ""
        yield partial_message
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# User interface: question box + reference count + button, the generated
# answer, and the list of references underneath.
with gr.Blocks() as demo:

    gr.Markdown("# Natur og miljø BOT")
    gr.Markdown("Dette er en simpel spørgsmål-svar applikation indenfor Danmarks natur og miljø. Svar genereres af en sprogmodel (LLAMA-3-8B) og anvender relevante referencer i en stor samling af dokumenter. Dette er blandt andet artikler fra [Wikipedia](https://da.wikipedia.org/wiki/Forside), rapporter fra [DCE - Nationalt Center for Miljø og Energi](https://dce.au.dk/udgivelser), [lex.dk - Den Store Danske](https://denstoredanske.lex.dk/) samt sager fra [Miljø og fødevareklagenævnet](https://mfkn.naevneneshus.dk).")

    with gr.Row():
        textbox = gr.Textbox(placeholder="Søg...", lines=1, scale=8, label="Spørgsmål")
        num = gr.Number(5, label="Referencer", scale=1, minimum=1, maximum=10)
        btn = gr.Button("Søg!", size="sm", scale=2)

    with gr.Row():
        summary = gr.Textbox(interactive=False, lines=10, label="Svar")

    with gr.Row():
        results = gr.Markdown()

    gr.Markdown("*Applikation lavet af Kenneth Thorø Martinsen (email: kenneth2810@gmail.com)*")

    # Both the button and pressing Enter run the same two-step pipeline.
    # The summary is chained with .success() so it only runs when the search
    # step finished without error; otherwise it would summarise whatever an
    # earlier search left in the shared state.
    for trigger in (btn.click, textbox.submit):
        trigger(fn=search, inputs=[textbox, num], outputs=results).success(
            search_summary, inputs=None, outputs=summary
        )

# Queueing is enabled before launch; search_summary is a generator whose
# partial outputs are streamed to the answer box.
demo.queue().launch()
|