File size: 5,713 Bytes
0c6d13f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import pandas as pd
import datetime
import gradio as gr
import os
# Shows the demo data format in the fine-tuning tab.
def move_to(move, model_ans):
    """Build the ID / Question / Answer labels for one row of a model-answer sheet.

    Args:
        move: Zero-based row position to show. It is converted with ``int()``.
            A position outside the sheet emits a ``gr.Info`` notice with the
            number of questions and falls back to the first row.
        model_ans: File name of the workbook inside the ``model_ans`` folder.

    Returns:
        list: Three ``gr.Label`` components labelled "ID", "Question" and
        "Answer". If the sheet has no rows, the labels carry empty values.
    """
    df_temp = pd.read_excel(os.path.join("model_ans", str(model_ans)))
    total = len(df_temp)
    move = int(move)

    # Validate the position BEFORE touching the frame: looking the row up
    # first would raise on an out-of-range position and the fallback to the
    # first row would never take effect.
    if move < 0 or move >= total:
        gr.Info(f"Number of questions: {total}")
        move = 0

    if total == 0:
        # Nothing to display; return empty labels instead of failing on row 0.
        return [
            gr.Label(value="", label="ID"),
            gr.Label(value="", label="Question"),
            gr.Label(value="", label="Answer"),
        ]

    # read_excel yields a default 0..n-1 index, so positional access selects
    # the same row the label-based lookup did.
    row = df_temp.iloc[move]
    return [
        gr.Label(value=str(int(row['id'])), label="ID"),
        gr.Label(value=row['question'], label="Question"),
        gr.Label(value=row['answer'], label="Answer"),
    ]
def display_table(path=r"data/demo_table_data.xlsx"):
    """Render the first two rows of an Excel sheet as a scrollable HTML table.

    Args:
        path: Location of the workbook to preview.

    Returns:
        str: An HTML ``<div>`` with horizontal overflow scrolling that wraps
        the table markup; the DataFrame index column is omitted.
    """
    preview_rows = pd.read_excel(path).head(2)
    table_markup = preview_rows.to_html(index=False)
    return f"<div style='overflow-x:auto;'>{table_markup}</div>"
def current_time():
    """Return the current date and time formatted as ``YYYY_MM_DD_HH_MM_SS``."""
    stamp_format = "%Y_%m_%d_%H_%M_%S"
    return datetime.datetime.now().strftime(stamp_format)
# This function is used in human evaluation.
def random_ques_ans2():
    """Pick one random question from ``data/existing_dataset.xlsx``.

    Returns:
        tuple: ``(question, answer)`` where ``question`` is the value of the
        ``question`` column of a randomly chosen row and ``answer`` is always
        an empty string.

    Raises:
        ValueError: If the dataset has no rows (raised by ``random.randrange``).
    """
    import random

    df = pd.read_excel(r"data/existing_dataset.xlsx")
    # randrange excludes its upper bound. randint(0, len(df)) could return
    # len(df), which is one past the last row and fails on lookup.
    row_pos = random.randrange(len(df))
    ques_temp = df.iloc[row_pos]['question']
    ans_temp = ""
    return ques_temp, ans_temp
def _score_report_label(file_name):
    """Turn a score-report file name into a short, readable model label."""
    stem = os.path.splitext(file_name)[0]
    pieces = []
    seen_letter = False
    for ch in stem:
        if 'a' <= ch <= 'z':
            seen_letter = True
            pieces.append(ch)
        elif seen_letter:
            # Once the name has started, anything that is not a lowercase
            # letter (digits, "_", ...) acts as a word break. Characters
            # before the first letter are dropped entirely.
            pieces.append(" ")
    label = ''.join(pieces)
    label = label.replace("model ans", "")
    label = label.replace("finetuned", "")
    # Collapse the runs of spaces left by the steps above and trim the ends.
    return " ".join(label.split())


def score_report_bar():
    """Summarise every workbook in ``score_report`` as an average rating.

    Each ``.xlsx`` / ``.xls`` file contributes one row. Other files are
    ignored, and so are workbooks without any rows, because they have no
    average.

    Returns:
        pandas.DataFrame: Columns ``"Model Name"`` (label derived from the
        file name) and ``"Average Rating"`` (mean of the ``rating`` column).
        The columns are present even when no workbook was found.
    """
    path = "score_report"
    dat = []
    for file_name in sorted(os.listdir(path)):
        # Skip anything that is not a spreadsheet; read_excel would raise on it.
        if not file_name.lower().endswith((".xlsx", ".xls")):
            continue
        df_temp = pd.read_excel(os.path.join(path, file_name))
        if len(df_temp) == 0:
            # An empty sheet would divide by zero below.
            continue
        rating = sum(df_temp["rating"]) / len(df_temp)
        dat.append({
            "Model Name": _score_report_label(file_name),
            "Average Rating": rating,
        })
    return pd.DataFrame(dat, columns=["Model Name", "Average Rating"])
def parse_data(link, progress):
    """Crawl outward from ``link`` for a few seconds and save the page text as a .docx.

    The crawl follows absolute ``http`` links depth-first until the time
    budget is used up. Every discovered URL is then loaded with
    ``WebBaseLoader``, and the combined text is written to
    ``rag_data/<sanitised link>.docx``.

    Args:
        link: Start URL of the crawl; also used (sanitised) as the output
            file name.
        progress: Object exposing ``tqdm(iterable)``, used to report progress
            while pages are loaded. Presumably a Gradio progress tracker;
            confirm against the caller.

    Returns:
        None
    """
    import os
    import re
    import time

    import requests
    from bs4 import BeautifulSoup
    from docx import Document
    from langchain_community.document_loaders import WebBaseLoader

    s = {link}          # every URL discovered so far
    visited = set()     # URLs whose links were already extracted
    duration = 5        # crawl budget in seconds
    request_timeout = 5  # per-request limit so one stalled server cannot hang the crawl
    start_time = time.time()

    def get_links(url):
        """Return the absolute http(s) links found on ``url`` (empty on fetch failure)."""
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as exc:
            # A single unreachable page must not abort the whole crawl.
            print("Skipping link: ", url, exc)
            return []
        soup = BeautifulSoup(response.text, 'lxml')
        links = []
        for anchor in soup.find_all('a'):
            link_url = anchor.get('href')
            if link_url is not None and link_url.startswith('http'):
                s.add(link_url)
                links.append(link_url)
        return links

    def get_all_links(url):
        """Follow links depth-first until the time budget is exhausted."""
        visited.add(url)
        for child in get_links(url):
            if (time.time() - start_time) >= duration:
                return
            if child in visited:
                # Pages commonly link back to each other; do not fetch twice.
                continue
            get_all_links(child)

    def data_ret2(url):
        """Return the text content of the first document loaded from ``url``."""
        loader = WebBaseLoader(f"{url}")
        data = loader.load()
        return data[0].page_content

    get_all_links(link)

    all_data = []
    for x in progress.tqdm(sorted(s)):
        print("Link: ", x)
        try:
            all_data.append(data_ret2(x))
        except Exception as exc:
            # Best effort: pages that cannot be loaded are skipped.
            print("pass", exc)
            continue

    all_data2 = re.sub(r'\n+', '\n\n', "\n".join(all_data))
    # Remove characters outside the ranges listed below. They match the
    # XML 1.0 character set, which is presumably needed because .docx is XML.
    all_data2 = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', all_data2)

    document = Document()
    document.add_paragraph(all_data2)

    # A URL contains ":" and "/", which would be interpreted as directories;
    # reduce it to a flat, filesystem-safe file name.
    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', str(link)).strip('_') or "site"
    os.makedirs("rag_data", exist_ok=True)
    document.save(os.path.join("rag_data", f"{safe_name}.docx"))
    print("Finished!!")
    return
def all_contri_ans(id, ques):
    """Collect every contributed answer to ``ques`` from the saved spreadsheets.

    All ``.xlsx`` / ``.xls`` files in the ``save_ques_ans`` folder are read
    and searched for rows whose ``question`` column equals ``ques`` exactly.

    Args:
        id: Not used by this function; kept so existing callers keep working.
        ques: Question text to look up.

    Returns:
        list: The matching values of the ``answer`` column. If nothing
        matches, or the folder is missing or holds no spreadsheets, a
        one-element list with a "not available" message is returned.
    """
    folder_path = 'save_ques_ans'
    not_available = ["This question's answer is not available."]

    if not os.path.isdir(folder_path):
        return not_available

    data_frames = [
        pd.read_excel(os.path.join(folder_path, filename))
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith((".xlsx", ".xls"))
    ]
    # pd.concat raises ValueError on an empty list, so answer directly when
    # no contribution has been saved yet.
    if not data_frames:
        return not_available

    df_hum = pd.concat(data_frames, ignore_index=True)
    answers = [
        answer
        for question, answer in zip(df_hum['question'], df_hum['answer'])
        if question == ques
    ]
    return answers if answers else not_available
import json
import os
def save_params_to_file(model_name,embedding_name, splitter_type_dropdown, chunk_size_slider,
                        chunk_overlap_slider, separator_textbox, max_tokens_slider, filename="params.txt"):
    """Store the given settings as JSON in ``filename`` and in ``deploy//params.txt``.

    Each argument is saved under a key equal to its parameter name. Both
    files receive the same JSON object; ``filename`` is written first.
    """
    keys = (
        "model_name",
        "embedding_name",
        "splitter_type_dropdown",
        "chunk_size_slider",
        "chunk_overlap_slider",
        "separator_textbox",
        "max_tokens_slider",
    )
    values = (
        model_name,
        embedding_name,
        splitter_type_dropdown,
        chunk_size_slider,
        chunk_overlap_slider,
        separator_textbox,
        max_tokens_slider,
    )
    settings = dict(zip(keys, values))

    for target in (filename, "deploy//params.txt"):
        with open(target, 'w') as handle:
            json.dump(settings, handle)
def load_params_from_file(filename="params.txt"):
    """Read parameters back from a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        return None
    with open(filename, 'r') as handle:
        return json.load(handle)