Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,7 +18,7 @@ client = OpenAI(
|
|
| 18 |
# 函数:解析PDF文件
|
| 19 |
def extract_text_from_pdf(file_path):
|
| 20 |
text = pdfminer.high_level.extract_text(file_path)
|
| 21 |
-
print("=======
|
| 22 |
print(text)
|
| 23 |
return text
|
| 24 |
|
|
@@ -28,12 +28,17 @@ def extract_text_from_docx(file_path):
|
|
| 28 |
text = ""
|
| 29 |
for paragraph in doc.paragraphs:
|
| 30 |
text += paragraph.text + "\n"
|
|
|
|
|
|
|
| 31 |
return text
|
| 32 |
|
| 33 |
# 函数:解析TXT文件
|
| 34 |
def extract_text_from_txt(file_path):
|
| 35 |
with open(file_path, "r", encoding="utf-8") as f:
|
| 36 |
text = f.read()
|
|
|
|
|
|
|
|
|
|
| 37 |
return text
|
| 38 |
|
| 39 |
# 函数:根据文件类型选择解析函数
|
|
@@ -73,7 +78,7 @@ def split_text(text, max_length=500):
|
|
| 73 |
return chunks
|
| 74 |
|
| 75 |
# 函数:计算相似度并返回最相关的片段
|
| 76 |
-
def
|
| 77 |
input_embedding = get_embedding(input_text)
|
| 78 |
all_embeddings = []
|
| 79 |
all_texts = []
|
|
@@ -83,16 +88,18 @@ def find_most_relevant_section(input_text, file_texts):
|
|
| 83 |
all_texts.extend(chunks)
|
| 84 |
all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
|
| 85 |
|
| 86 |
-
similarities = cosine_similarity([input_embedding], all_embeddings)
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# 定义处理上传文件和回答的函数
|
| 91 |
def gpt_answer(brand_name, files):
|
| 92 |
file_contents = [parse_file(file) for file in files]
|
| 93 |
-
most_relevant_text =
|
| 94 |
|
| 95 |
-
response = f"品牌名称: {brand_name}\n\n最相关的内容:\n
|
| 96 |
return response
|
| 97 |
|
| 98 |
def gpt_answer_a(question, files):
|
|
|
|
| 18 |
# 函数:解析PDF文件
|
| 19 |
def extract_text_from_pdf(file_path):
    """Extract all text from a PDF file.

    Args:
        file_path: Filesystem path of the PDF to parse.

    Returns:
        The extracted text as a single string.
    """
    content = pdfminer.high_level.extract_text(file_path)
    # Debug trace deliberately added by the commit; marks PDF output.
    print("=======ppa=======")
    print(content)
    return content
|
| 24 |
|
|
|
|
| 28 |
text = ""
|
| 29 |
for paragraph in doc.paragraphs:
|
| 30 |
text += paragraph.text + "\n"
|
| 31 |
+
print("=======ppb=======")
|
| 32 |
+
print(text)
|
| 33 |
return text
|
| 34 |
|
| 35 |
# 函数:解析TXT文件
|
| 36 |
def extract_text_from_txt(file_path):
    """Read a UTF-8 encoded text file and return its full contents.

    Args:
        file_path: Filesystem path of the text file.

    Returns:
        The file's contents as a string.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        content = handle.read()

    # Debug trace deliberately added by the commit; marks TXT output.
    print("=======ppc=======")
    print(content)
    return content
|
| 43 |
|
| 44 |
# 函数:根据文件类型选择解析函数
|
|
|
|
| 78 |
return chunks
|
| 79 |
|
| 80 |
# 函数:计算相似度并返回最相关的片段
|
| 81 |
+
def find_top_n_relevant_sections(input_text, file_texts , n):
|
| 82 |
input_embedding = get_embedding(input_text)
|
| 83 |
all_embeddings = []
|
| 84 |
all_texts = []
|
|
|
|
| 88 |
all_texts.extend(chunks)
|
| 89 |
all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
|
| 90 |
|
| 91 |
+
similarities = cosine_similarity([input_embedding], all_embeddings)[0]
|
| 92 |
+
top_n_indices = similarities.argsort()[-n:][::-1]
|
| 93 |
+
top_n_texts = [all_texts[i] for i in top_n_indices]
|
| 94 |
+
|
| 95 |
+
return top_n_texts
|
| 96 |
|
| 97 |
# 定义处理上传文件和回答的函数
|
| 98 |
def gpt_answer(brand_name, files):
    """Answer with the most relevant file content for a brand name.

    Parses every uploaded file to text, retrieves the five sections most
    similar to ``brand_name`` via embeddings, and formats them into a reply.

    Args:
        brand_name: The brand name used as the similarity query.
        files: Uploaded file objects; each is converted to text by parse_file.

    Returns:
        A formatted string containing the brand name followed by the top
        five relevant text sections, separated by blank lines.
    """
    file_contents = [parse_file(file) for file in files]
    # Bug fix: the original assigned to `most_relevant_text` (singular) but
    # joined `most_relevant_texts` (plural), raising NameError on every call.
    most_relevant_texts = find_top_n_relevant_sections(brand_name, file_contents, 5)
    response = f"品牌名称: {brand_name}\n\n最相关的内容:\n\n" + "\n\n".join(most_relevant_texts)
    return response
|
| 104 |
|
| 105 |
def gpt_answer_a(question, files):
|