File size: 8,296 Bytes
dea2c24
226ef43
11f5bf4
 
 
 
 
 
 
 
1e934ec
dea2c24
 
 
 
2216dc7
aecb436
11f5bf4
 
 
 
 
 
7d505c6
11f5bf4
 
 
 
 
 
 
 
aecb436
11f5bf4
 
 
 
 
 
 
 
 
0538474
11f5bf4
 
 
 
 
 
 
 
 
 
 
bce90b4
 
11f5bf4
 
 
 
 
 
 
 
 
 
 
 
 
 
bce90b4
4055c38
1c303f5
2ff8c45
 
 
 
 
 
42dcd1d
11f5bf4
 
ff6272b
 
 
11f5bf4
ff6272b
 
11f5bf4
 
a3fb620
1e934ec
13fe061
ff6272b
83ebb0e
 
1b25a35
278db2f
83ebb0e
1e934ec
 
 
11f5bf4
323dc59
947860c
323dc59
235395f
d7b1a24
 
b8b8bd6
b2cfb1e
 
 
d7b1a24
b2cfb1e
d7b1a24
 
47825be
d7b1a24
 
dc3e4c0
 
a9ddb96
0b92789
11f5bf4
 
 
dea2c24
 
 
 
 
 
 
ead1b81
 
1b5d3fa
937c441
 
 
15e96ef
47825be
e535008
 
 
 
 
f8ce3be
e535008
228bf86
0de2123
e535008
 
 
 
8ea7d45
e535008
10d33a9
398c271
da57524
dea2c24
 
 
55d391a
 
b9b35f4
a9ddb96
dea2c24
 
 
 
3bab283
dea2c24
3bab283
dea2c24
 
 
 
 
 
 
 
 
 
065778e
 
 
 
 
 
 
 
 
 
 
dea2c24
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import gradio as gr
from huggingface_hub import InferenceClient
from huggingface_hub import login
import re
import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer
import copy
from langchain_community.retrievers import BM25Retriever
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# Shared Hugging Face inference client; ra() and respond() both send their
# chat-completion requests through it.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# Champion table. The code below reads its 'Champion', 'Role' and 'Story' columns.
df1 = pd.read_csv("./data/champions_data_lol.csv")
# Force every story to str so it can be used directly as Document text.
# NOTE(review): depending on the pandas version, a missing story may become the
# literal text "nan" here and end up indexed — confirm the CSV has no gaps.
df1['Story'] = df1['Story'].astype(str)
# Pre-processing
# The three-dot ellipsis is listed first so it is matched as a single unit
# before the single-character punctuation alternative can split it up.
_BM25_PUNCTUATION = re.compile(r'\.{3}|[.,!?()"\']')
_BM25_WHITESPACE = re.compile(r'\s+')


def preprocess_for_bm25(text):
    """Normalise *text* before it is indexed or queried with BM25.

    Every ``...`` and every single ``. , ! ? ( ) " '`` is surrounded by
    spaces, runs of whitespace are collapsed to one space, the result is
    stripped and lower-cased.

    The spacing is done in one regex pass instead of swapping ``...`` for a
    placeholder word and back, so input that happens to contain the text
    ``_ELLIPSIS_`` is no longer rewritten into ``...``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised, lower-cased string.
    """
    spaced = _BM25_PUNCTUATION.sub(r' \g<0> ', text)
    return _BM25_WHITESPACE.sub(' ', spaced).strip().lower()

"""Pre-processing"""
# Turn every champion row into one LangChain Document: the story is the
# text, the champion name and role are carried along as metadata.
documents = [
    Document(
        page_content=row['Story'],
        metadata={
            'champion_name': row['Champion'],
            'role': row['Role'],
        },
    )
    for _, row in df1.iterrows()
]

"""Chunking"""

# Specify the model name.
# Despite the variable name, this value is only used below to select the
# tokenizer that measures chunk length.
EMBEDDING_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
tokenizer_name = EMBEDDING_MODEL_NAME

# Split on token counts (not characters): chunk_size and chunk_overlap below
# are counted with the tokenizer loaded for `tokenizer_name`.
text_splitter = TokenTextSplitter.from_huggingface_tokenizer(
    tokenizer=AutoTokenizer.from_pretrained(tokenizer_name),
    chunk_size=150,
    chunk_overlap=15
)

chunks = text_splitter.split_documents(documents) # original-text chunks, handed to the LLM as context

# Independent copy that gets normalised for BM25. It must be taken before the
# loop below mutates page_content, otherwise `chunks` would be altered too.
chunks_bm25 = copy.deepcopy(chunks)

for i, doc in enumerate(chunks_bm25):
    doc.page_content = preprocess_for_bm25(doc.page_content)  # normalise the BM25 copy in place
    doc.metadata["index"] = i  # position shared with `chunks`; retriever() uses it to map a hit back

for i, doc in enumerate(chunks):
    doc.metadata["index"] = i  # same position index on the original-text chunk

"""Retriever"""
bm25_retriever = BM25Retriever.from_documents(chunks_bm25, k = 4) # k = 4: four chunks are requested per query

def retriever(query):
    """Return the original-text chunks that BM25 ranks best for *query*.

    The query is normalised with ``preprocess_for_bm25`` and run against
    ``bm25_retriever``. Each hit comes from the normalised copy of the
    corpus, so its ``index`` metadata is used to fetch the matching
    unmodified entry from ``chunks``.
    """
    hits = bm25_retriever.invoke(preprocess_for_bm25(query))
    return [chunks[hit.metadata['index']] for hit in hits]

"""Chain"""

# NOTE: apart from the HuggingFaceHub import, everything in this section is
# disabled experiment code: commented-out imports plus a bare string literal
# that holds an old `llm = HuggingFaceHub(...)` configuration. None of it has
# any runtime effect. In the code visible here, HuggingFaceHub itself is only
# referenced inside that disabled snippet.
#from langchain_core.runnables.passthrough import RunnablePassthrough
#from langchain.prompts import ChatPromptTemplate
#from langchain_core.output_parsers.string import StrOutputParser
from langchain_community.llms.huggingface_hub import HuggingFaceHub
#import os
#from langchain_core.runnables import RunnableLambda


#prompt_template = ChatPromptTemplate.from_template(prompt)
"""llm = HuggingFaceHub(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    #repo_id="google-bert/bert-base-uncased",
    model_kwargs={
        "temperature": 0.1,
        "max_length": 5,
        "return_full_text": False
    }
"""

# Set the correct endpoint and task for the model

def ra(user_question):
    """Ask the chat model to rewrite *user_question* with corrected grammar.

    Sends a two-message chat request (system + user) through the module-level
    ``client`` and returns the text of the first choice.

    Args:
        user_question: The raw question as typed by the user.

    Returns:
        The rewritten question, or *user_question* unchanged when the model
        returns no usable text (``None``, empty or whitespace only).
    """
    messages_q = [
        {"role": "system", "content": "You are familiar with League of Legends lore. You help correct grammar and clarity without giving additional explanations."},
        {"role": "user", "content": f"Fix any grammar or clarity issues in the following question. Only return the corrected question itself.\n\n{user_question}"},
    ]
    print(messages_q)
    res = client.chat_completion(
        messages_q,
        # NOTE(review): 30 tokens may truncate a long question — confirm the limit.
        max_tokens=30,
        stream=False,
        temperature=0.1,
        # Stop at an opening parenthesis; presumably this cuts off the
        # parenthesised remarks the model tends to append after the question.
        stop=['('],
    )
    corrected = res["choices"][0]["message"]["content"]
    # Strings are immutable, so the value is returned as is (no copy needed).
    # An empty rewrite would leave the caller without a query, so fall back
    # to the question the user actually asked.
    if not corrected or not corrected.strip():
        return user_question
    return corrected

# chain = RunnablePassthrough() | RunnableLambda(ra) | prompt_template | client.chat_completion() | StrOutputParser() for notebook

"""-------------------------------------------------------------------"""
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens=200,
    temperature=0.1,
    top_p=None,
):
    """Answer a lore question, streaming the reply as it is generated.

    The user message is first rewritten by ``ra``; the rewritten question is
    used both to retrieve context chunks via ``retriever`` and as the question
    sent to the chat model.

    Args:
        message: The user's latest message.
        history: Earlier ``(user, assistant)`` turns; empty entries are skipped.
        max_tokens: Generation limit passed to the chat model. Defaults to 200.
        temperature: Sampling temperature passed to the chat model.
            Defaults to 0.1.
        top_p: Nucleus-sampling value passed to the chat model. ``None``
            leaves it unset.

    The three sampling parameters have defaults so the function can be called
    with only ``message`` and ``history``.

    Yields:
        The answer accumulated so far, once per non-empty streamed token.
    """
    # Fall back to the raw message if the rewrite step returned nothing.
    new_query = ra(message) or message
    print("old: ", new_query)
    # ra() stops generation at "("; drop it if it was left at the end.
    # removesuffix is also safe on an empty string, unlike indexing [-1].
    new_query = new_query.removesuffix("(")
    print("new: ", new_query)

    # NOTE(review): this longer prompt is only logged. The request below sends
    # just its first sentence as the system message. Confirm whether the full
    # instructions were meant to be sent (the trailing "=" on the first line
    # looks like a typo).
    system_message = """
You are an expert in League of Legends (LoL) lore. You will only answer questions related to the champions and their stories within the game.=
Instructions:
1. Only use the context provided below to answer the question. Reference the context directly for accuracy.
2. If the question is outside the scope of League of Legends lore, respond: "Please ask something related to League of Legends lore."
3. If the provided context does not provide a clear answer, respond: "I'm unsure based on the provided context."

"""
    print(system_message)
    messages = [{"role": "system", "content": "You are an expert in League of Legends (LoL) lore. You will only answer questions related to the champions and their stories within the game."}]

    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    # Retrieve once and reuse the result for both the request and the log line.
    context = retriever(new_query)
    user_prompt = f"Context: {context}\n\nQuestion: {new_query}\n\nAnswer: "
    messages.append({"role": "user", "content": user_prompt})
    print(user_prompt)

    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # A streamed chunk can arrive without choices or with no text
        # (content is None); skip it instead of failing on `str + None`.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Page layout: a Markdown introduction followed by the chat widget.
with gr.Blocks() as demo:
    # Intro text shown above the chat. The string is user-facing, so edit it with care.
    gr.Markdown("""
    # League of Legends Lore Chatbot  
    Welcome to the **LoL Lore Chatbot**! 🏆  
    Here, you can ask questions about League of Legends champions and their stories.  

    **Example Question:**  
    *Why does Kayn have different forms?*  
    """)
    
    # Chat widget backed by respond().
    chat = gr.ChatInterface(respond)


# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()