# LiveRAG_QA / app.py
# Hugging Face Space file (author: Zaious; commit 3af6820, verified)
import gradio as gr
import os
import openai
from openai import OpenAI
import logging
import fitz # PyMuPDF
import pdfminer.high_level
import docx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor
logging.basicConfig(level=logging.DEBUG)
os.environ["OPENAI_API_KEY"] = os.environ["gptkey"]
client = OpenAI(
api_key=os.environ['OPENAI_API_KEY'], # this is also the default, it can be omitted
)
def generate_answer(brand_name,question, files):
with ThreadPoolExecutor(max_workers=3) as executor:
fun_1 = executor.submit(gpt_relevant_info,brand_name,question, files)
info = fun_1.result()
fun_2 = executor.submit(gpt_answer,brand_name,question, info)
ans = fun_2.result()
return ans,info
# 函数:解析PDF文件
def extract_text_from_pdf(file_path):
pdf_document = fitz.open(file_path)
total_pages = pdf_document.page_count
print(f"总页数: {total_pages}")
# 读取文本
page = pdf_document.load_page(0) # 读取第一页
text = page.get_text("text")
print(f"第一页文本:\n{text}")
text = ""
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
text += page.get_text()
return text
# 函数:解析DOCX文件
def extract_text_from_docx(file_path):
doc = docx.Document(file_path)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
print("=======ppb=======")
print(text)
return text
# 函数:解析TXT文件
def extract_text_from_txt(file_path):
with open(file_path, "r", encoding="utf-8") as f:
text = f.read()
print("=======ppc=======")
print(text)
return text
# 函数:根据文件类型选择解析函数
def parse_file(file):
file_path = file.name
if file_path.endswith(".pdf"):
return extract_text_from_pdf(file_path)
elif file_path.endswith(".docx"):
return extract_text_from_docx(file_path)
elif file_path.endswith(".txt"):
return extract_text_from_txt(file_path)
else:
return "Unsupported file type"
# 函数:获取文本嵌入向量
def get_embedding(text, model="text-embedding-3-small"):
response = client.embeddings.create(input=text, model=model)
em = response.data[0].embedding
print("======emm=====")
print(em)
return em
# 函数:将长文本切片为较小段落
def split_text(text, max_length=500):
sentences = text.split('.')
chunks = []
current_chunk = ""
for sentence in sentences:
if len(current_chunk) + len(sentence) <= max_length:
current_chunk += sentence + '.'
else:
chunks.append(current_chunk)
current_chunk = sentence + '.'
if current_chunk:
chunks.append(current_chunk)
return chunks
# 函数:计算相似度并返回最相关的片段
def find_top_n_relevant_sections(input_text, file_texts , n):
input_embedding = get_embedding(input_text)
all_embeddings = []
all_texts = []
for text in file_texts:
chunks = split_text(text)
all_texts.extend(chunks)
all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
similarities = cosine_similarity([input_embedding], all_embeddings)[0]
top_n_indices = similarities.argsort()[-n:][::-1]
top_n_texts = [all_texts[i] for i in top_n_indices]
return top_n_texts
# 定义处理上传文件和回答的函数
def gpt_relevant_info(brand_name,question, files):
file_contents = [parse_file(file) for file in files]
ask = brand_name + question
most_relevant_texts = find_top_n_relevant_sections(ask, file_contents,5)
response = "\n\n".join(most_relevant_texts)
return response
def gpt_answer(brand_name,question, info):
messages_base = [
{"role": "system", "content": "請扮演一個具備專業知識的商業策略分析師"}
]
# Creating a prompt with a structured format for the Persona in Traditional Chinese
prompt_text = f"請根據品牌:{brand_name}與你原本的知識以及參考資料:{info}來回答這個問題{question}」"
messages_base.append({"role": "user", "content": prompt_text})
#for _ in range(loop):
response = client.chat.completions.create(
model='gpt-4o',
max_tokens=4096,
temperature=0,
messages=messages_base
)
completed_text = response.choices[0].message.content
return completed_text
demo = gr.Interface(
fn=generate_answer,
inputs=[
gr.Text(label="品牌名稱",value="Toyota"),
gr.Text(label="關注問題",value="電動車展業的發展"),
gr.File(label="上傳文件", file_count="multiple"), # 增加文件上傳功能
],
outputs=[
gr.Text(label="結果解釋",lines=30),
gr.Text(label="向量資料",lines=10)
],
title="回答助手",
description="根據上傳的檔案進行回答",
allow_flagging="never", )
demo.launch(share=True)