Zaious committed on
Commit
47c2ca7
·
verified ·
1 Parent(s): 8800100

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -18,7 +18,7 @@ client = OpenAI(
18
  # 函数:解析PDF文件
19
  def extract_text_from_pdf(file_path):
20
  text = pdfminer.high_level.extract_text(file_path)
21
- print("=======ppp=======")
22
  print(text)
23
  return text
24
 
@@ -28,12 +28,17 @@ def extract_text_from_docx(file_path):
28
  text = ""
29
  for paragraph in doc.paragraphs:
30
  text += paragraph.text + "\n"
 
 
31
  return text
32
 
33
  # 函数:解析TXT文件
34
  def extract_text_from_txt(file_path):
35
  with open(file_path, "r", encoding="utf-8") as f:
36
  text = f.read()
 
 
 
37
  return text
38
 
39
  # 函数:根据文件类型选择解析函数
@@ -73,7 +78,7 @@ def split_text(text, max_length=500):
73
  return chunks
74
 
75
  # 函数:计算相似度并返回最相关的片段
76
- def find_most_relevant_section(input_text, file_texts):
77
  input_embedding = get_embedding(input_text)
78
  all_embeddings = []
79
  all_texts = []
@@ -83,16 +88,18 @@ def find_most_relevant_section(input_text, file_texts):
83
  all_texts.extend(chunks)
84
  all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
85
 
86
- similarities = cosine_similarity([input_embedding], all_embeddings)
87
- most_relevant_index = np.argmax(similarities)
88
- return all_texts[most_relevant_index]
 
 
89
 
90
  # 定义处理上传文件和回答的函数
91
  def gpt_answer(brand_name, files):
92
  file_contents = [parse_file(file) for file in files]
93
- most_relevant_text = find_most_relevant_section(brand_name, file_contents)
94
 
95
- response = f"品牌名称: {brand_name}\n\n最相关的内容:\n{most_relevant_text}"
96
  return response
97
 
98
  def gpt_answer_a(question, files):
 
18
  # 函数:解析PDF文件
19
  def extract_text_from_pdf(file_path):
20
  text = pdfminer.high_level.extract_text(file_path)
21
+ print("=======ppa=======")
22
  print(text)
23
  return text
24
 
 
28
  text = ""
29
  for paragraph in doc.paragraphs:
30
  text += paragraph.text + "\n"
31
+ print("=======ppb=======")
32
+ print(text)
33
  return text
34
 
35
  # 函数:解析TXT文件
36
  def extract_text_from_txt(file_path):
37
  with open(file_path, "r", encoding="utf-8") as f:
38
  text = f.read()
39
+
40
+ print("=======ppc=======")
41
+ print(text)
42
  return text
43
 
44
  # 函数:根据文件类型选择解析函数
 
78
  return chunks
79
 
80
  # 函数:计算相似度并返回最相关的片段
81
+ def find_top_n_relevant_sections(input_text, file_texts , n):
82
  input_embedding = get_embedding(input_text)
83
  all_embeddings = []
84
  all_texts = []
 
88
  all_texts.extend(chunks)
89
  all_embeddings.extend([get_embedding(chunk) for chunk in chunks])
90
 
91
+ similarities = cosine_similarity([input_embedding], all_embeddings)[0]
92
+ top_n_indices = similarities.argsort()[-n:][::-1]
93
+ top_n_texts = [all_texts[i] for i in top_n_indices]
94
+
95
+ return top_n_texts
96
 
97
  # 定义处理上传文件和回答的函数
98
  def gpt_answer(brand_name, files):
99
  file_contents = [parse_file(file) for file in files]
100
+ most_relevant_text = find_top_n_relevant_sections(brand_name, file_contents,5)
101
 
102
+ response = f"品牌名称: {brand_name}\n\n最相关的内容:\n\n" + "\n\n".join(most_relevant_texts)
103
  return response
104
 
105
  def gpt_answer_a(question, files):