Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,7 +18,7 @@ client = OpenAI(
|
|
| 18 |
# 函数:解析PDF文件
|
| 19 |
def extract_text_from_pdf(file_path):
|
| 20 |
text = pdfminer.high_level.extract_text(file_path)
|
| 21 |
-
print("=======
|
| 22 |
print(text)
|
| 23 |
return text
|
| 24 |
|
|
@@ -28,12 +28,17 @@ def extract_text_from_docx(file_path):
|
|
| 28 |
text = ""
|
| 29 |
for paragraph in doc.paragraphs:
|
| 30 |
text += paragraph.text + "\n"
|
|
|
|
|
|
|
| 31 |
return text
|
| 32 |
|
| 33 |
# 函数:解析TXT文件
|
| 34 |
def extract_text_from_txt(file_path):
|
| 35 |
with open(file_path, "r", encoding="utf-8") as f:
|
| 36 |
text = f.read()
|
|
|
|
|
|
|
|
|
|
| 37 |
return text
|
| 38 |
|
| 39 |
# 函数:根据文件类型选择解析函数
|
|
@@ -73,7 +78,7 @@ def split_text(text, max_length=500):
|
|
| 73 |
return chunks
|
| 74 |
|
| 75 |
# 函数:计算相似度并返回最相关的片段
|
| 76 |
-
def
|
| 77 |
input_embedding = get_embedding(input_text)
|
| 78 |
all_embeddings = []
|
| 79 |
all_texts = []
|
|
@@ -83,16 +88,18 @@ def find_most_relevant_section(input_text, file_texts):
|
|
| 83 |
all_texts.extend(chunks)
|
| 84 |
all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
|
| 85 |
|
| 86 |
-
similarities = cosine_similarity([input_embedding], all_embeddings)
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# 定义处理上传文件和回答的函数
|
| 91 |
def gpt_answer(brand_name, files):
|
| 92 |
file_contents = [parse_file(file) for file in files]
|
| 93 |
-
most_relevant_text =
|
| 94 |
|
| 95 |
-
response = f"品牌名称: {brand_name}\n\n最相关的内容:\n
|
| 96 |
return response
|
| 97 |
|
| 98 |
def gpt_answer_a(question, files):
|
|
|
|
| 18 |
# 函数:解析PDF文件
|
| 19 |
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF file.

    Args:
        file_path: Filesystem path of the PDF to parse.

    Returns:
        The extracted text as a single string.
    """
    content = pdfminer.high_level.extract_text(file_path)
    # Debug trace deliberately added by the commit; marks PDF output.
    print("=======ppa=======")
    print(content)
    return content
|
| 24 |
|
|
|
|
| 28 |
text = ""
|
| 29 |
for paragraph in doc.paragraphs:
|
| 30 |
text += paragraph.text + "\n"
|
| 31 |
+
print("=======ppb=======")
|
| 32 |
+
print(text)
|
| 33 |
return text
|
| 34 |
|
| 35 |
# 函数:解析TXT文件
|
| 36 |
def extract_text_from_txt(file_path):
    """Read a UTF-8 encoded text file and return its full contents.

    Args:
        file_path: Filesystem path of the text file.

    Returns:
        The file's contents as a string.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        content = handle.read()

    # Debug trace deliberately added by the commit; marks TXT output.
    print("=======ppc=======")
    print(content)
    return content
|
| 43 |
|
| 44 |
# 函数:根据文件类型选择解析函数
|
|
|
|
| 78 |
return chunks
|
| 79 |
|
| 80 |
# 函数:计算相似度并返回最相关的片段
|
| 81 |
+
def find_top_n_relevant_sections(input_text, file_texts , n):
|
| 82 |
input_embedding = get_embedding(input_text)
|
| 83 |
all_embeddings = []
|
| 84 |
all_texts = []
|
|
|
|
| 88 |
all_texts.extend(chunks)
|
| 89 |
all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
|
| 90 |
|
| 91 |
+
similarities = cosine_similarity([input_embedding], all_embeddings)[0]
|
| 92 |
+
top_n_indices = similarities.argsort()[-n:][::-1]
|
| 93 |
+
top_n_texts = [all_texts[i] for i in top_n_indices]
|
| 94 |
+
|
| 95 |
+
return top_n_texts
|
| 96 |
|
| 97 |
# 定义处理上传文件和回答的函数
|
| 98 |
def gpt_answer(brand_name, files):
    """Answer with the most relevant file content for a brand name.

    Parses every uploaded file to text, retrieves the five sections most
    similar to ``brand_name`` via embeddings, and formats them into a reply.

    Args:
        brand_name: The brand name used as the similarity query.
        files: Uploaded file objects; each is converted to text by parse_file.

    Returns:
        A formatted string containing the brand name followed by the top
        five relevant text sections, separated by blank lines.
    """
    file_contents = [parse_file(file) for file in files]
    # Bug fix: the original assigned to `most_relevant_text` (singular) but
    # joined `most_relevant_texts` (plural), raising NameError on every call.
    most_relevant_texts = find_top_n_relevant_sections(brand_name, file_contents, 5)
    response = f"品牌名称: {brand_name}\n\n最相关的内容:\n\n" + "\n\n".join(most_relevant_texts)
    return response
|
| 104 |
|
| 105 |
def gpt_answer_a(question, files):
|