fudii0921 committed on
Commit
c7ff072
·
verified ·
1 Parent(s): 19b3605

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -25
app.py CHANGED
@@ -15,13 +15,18 @@ co_rerank = cohere.ClientV2(os.environ.get("COHERE_API_KEY"))
15
 
16
  vectored = None
17
 
18
- #dataid = requests.get("https://www.ryhintl.com/dbjson/getjson?sqlcmd=select * from company_matters")
19
 
20
- #data_str = dataid.content.decode('utf-8')
21
- #data = json.loads(data_str)
22
 
23
- # デコード関数
24
  '''def decode_text(data):
 
 
 
 
 
 
25
  for item in data:
26
  try:
27
  # latin1 でデコードし、utf-8に変換
@@ -29,19 +34,62 @@ vectored = None
29
  except UnicodeDecodeError as e:
30
  print(f"エラー: {e}")
31
  item['text'] = "[デコード失敗]"
32
- return data'''
33
-
34
-
35
-
36
- #raw_documents = [{"title": "会議議事録", "url": os.environ.get("URL")+"output_cm.html"}]
37
- raw_documents = [{"title": "生成AI", "url": "https://ja.wikipedia.org/wiki/生成的人工知能"}]
38
- '''raw_documents = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  {"title": "バグダードの戦い", "url": "https://ja.wikipedia.org/wiki/バグダードの戦い"},
40
  {"title": "2006年トリノオリンピック", "url": "https://ja.wikipedia.org/wiki/2006年トリノオリンピック"},
41
  {"title": "ドレッドノート_(戦艦)", "url": "https://ja.wikipedia.org/wiki/ドレッドノート_(戦艦)"},
42
- {"title": "生成AI", "url": "https://ja.wikipedia.org/wiki/生成的人工知能"}
43
- ]'''
 
44
 
 
45
 
46
  # あなたのクラスとロジックをここに統合します
47
  class Vectorstore:
@@ -81,7 +129,7 @@ class Vectorstore:
81
  """
82
  Embeds the document chunks using the Cohere API.
83
  """
84
- #print("Embedding document chunks...")
85
 
86
  batch_size = 90
87
  self.docs_len = len(self.docs)
@@ -95,17 +143,32 @@ class Vectorstore:
95
  embedding_types=["float"]
96
  ).embeddings.float
97
  self.docs_embs.extend(docs_embs_batch)
98
- #print(docs_embs_batch)
99
 
100
- def index(self):
 
 
 
101
  print("Indexing document chunks...")
 
102
  self.idx = hnswlib.Index(space="ip", dim=1024)
103
- self.idx.init_index(max_elements=self.docs_len, ef_construction=512, M=128) # Increased M
104
  self.idx.add_items(self.docs_embs, list(range(len(self.docs_embs))))
105
- #print(f"Indexing complete with {self.idx.get_current_count()} document chunks.")
106
 
107
- def retrieve(self, query: str):
108
- print("Retrieving document chunks...")
 
 
 
 
 
 
 
 
 
 
 
 
109
  query_emb = co_embed.embed(
110
  texts=[query],
111
  model="embed-multilingual-v3.0",
@@ -113,10 +176,24 @@ class Vectorstore:
113
  embedding_types=["float"]
114
  ).embeddings.float
115
 
116
- self.idx.set_ef(100) # Set higher ef for query
117
- doc_ids = self.idx.knn_query(query_emb, k=self.retrieve_top_k)[0] # Retrieve IDs safely
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- docs_retrieved = [self.docs[doc_id]["data"] for doc_id in doc_ids]
120
  return docs_retrieved
121
 
122
  if not vectored == "vectored":
@@ -130,8 +207,8 @@ vectorstore = Vectorstore(raw_documents)
130
  # Gradioの関数
131
  def search(query):
132
  results = vectorstore.retrieve(query)
133
- #decoded_data = decode_text(results)
134
- #print("decoded_data:",decoded_data)
135
  return "\n\n".join([f"**Title**: {r['title']}\n**Text**: {r['text']}\n**URL**: {r['url']}" for r in decoded_data])
136
 
137
  # Gradioインターフェース
 
15
 
16
  vectored = None
17
 
18
+ dataid = requests.get("https://www.ryhintl.com/dbjson/getjson?sqlcmd=select * from company_matters")
19
 
20
+ data_str = dataid.content.decode('utf-8')
21
+ data = json.loads(data_str)
22
 
 
23
  '''def decode_text(data):
24
+ for item in data:
25
+ item['text'] = item['text'].encode('latin1').decode('utf-8') # latin1でエンコード、UTF-8でデコード
26
+ return data'''
27
+
28
+ # デコード関数
29
+ def decode_text(data):
30
  for item in data:
31
  try:
32
  # latin1 でデコードし、utf-8に変換
 
34
  except UnicodeDecodeError as e:
35
  print(f"エラー: {e}")
36
  item['text'] = "[デコード失敗]"
37
+ return data
38
+
39
+ '''def upload_html_to_server(file_path, url):
40
+ with open(file_path, "rb") as file:
41
+ # ファイルをアップロードする
42
+ response = requests.post(url, files={"file": file})
43
+ return response.status_code, response.text
44
+
45
+ def upload_text_to_server(text, url):
46
+ # テキストを送信
47
+ response = requests.post(url, data={"content": text})
48
+ return response.status_code, response.text
49
+
50
+ # 自然言語にする関数
51
+ def json_to_text(data):
52
+ final_context = ""
53
+ for item in data:
54
+ text = (
55
+ f"タイトル: {item['caption']}\n"
56
+ f"表題: {item['title']}\n"
57
+ f"詳細: {item['content']}\n"
58
+ f"日付: {item['date']}\n"
59
+ )
60
+ final_context += text + "\n" # 結果を連結
61
+
62
+ #last_content = "<html>\n<head>\n<title>会議議事録</title>\n</head>\n<body>\n"+final_context+"</body>\n</html>"
63
+ last_content = final_context
64
+
65
+ file_name = "output_cm.html"
66
+ with open(file_name, "w", encoding="utf-8") as file:
67
+ file.write(last_content)
68
+
69
+ # サーバーURLを指定
70
+ server_url = "https://www.ryhintl.com/company_matters.php" # 実際のアップロード先のURLに置き換えてね
71
+
72
+ # アップロード実行
73
+ status, response_text = upload_text_to_server(last_content, server_url)
74
+ print(f"アップロードのステータス: {status}")
75
+ print(f"レスポンス: {response_text}")
76
+
77
+ return final_context
78
+
79
+ # 実行例
80
+ result = json_to_text(data)
81
+
82
+ '''
83
+
84
+ raw_documents = [
85
  {"title": "バグダードの戦い", "url": "https://ja.wikipedia.org/wiki/バグダードの戦い"},
86
  {"title": "2006年トリノオリンピック", "url": "https://ja.wikipedia.org/wiki/2006年トリノオリンピック"},
87
  {"title": "ドレッドノート_(戦艦)", "url": "https://ja.wikipedia.org/wiki/ドレッドノート_(戦艦)"},
88
+ {"title": "えひめ丸事故", "url": "https://ja.wikipedia.org/wiki/えひめ丸事故"},
89
+ {"title": "会議議事録", "url": "https://www.ryhintl.com/reqfiles/company_matters/cm_output.html"}
90
+ ]
91
 
92
+ #raw_documents = [{"title": "会議議事録", "url": "https://www.ryhintl.com/reqfiles/company_matters/cm_output.html"}]
93
 
94
  # あなたのクラスとロジックをここに統合します
95
  class Vectorstore:
 
129
  """
130
  Embeds the document chunks using the Cohere API.
131
  """
132
+ print("Embedding document chunks...")
133
 
134
  batch_size = 90
135
  self.docs_len = len(self.docs)
 
143
  embedding_types=["float"]
144
  ).embeddings.float
145
  self.docs_embs.extend(docs_embs_batch)
146
+ print(docs_embs_batch)
147
 
148
+ def index(self) -> None:
149
+ """
150
+ Indexes the document chunks for efficient retrieval.
151
+ """
152
  print("Indexing document chunks...")
153
+
154
  self.idx = hnswlib.Index(space="ip", dim=1024)
155
+ self.idx.init_index(max_elements=self.docs_len, ef_construction=512, M=64)
156
  self.idx.add_items(self.docs_embs, list(range(len(self.docs_embs))))
 
157
 
158
+ print(f"Indexing complete with {self.idx.get_current_count()} document chunks.")
159
+
160
+ def retrieve(self, query: str) -> List[Dict[str, str]]:
161
+ """
162
+ Retrieves document chunks based on the given query.
163
+
164
+ Parameters:
165
+ query (str): The query to retrieve document chunks for.
166
+
167
+ Returns:
168
+ List[Dict[str, str]]: A list of dictionaries representing the retrieved document chunks, with 'title', 'text', and 'url' keys.
169
+ """
170
+
171
+ # Dense retrieval
172
  query_emb = co_embed.embed(
173
  texts=[query],
174
  model="embed-multilingual-v3.0",
 
176
  embedding_types=["float"]
177
  ).embeddings.float
178
 
179
+ doc_ids = self.idx.knn_query(query_emb, k=self.retrieve_top_k)[0][0]
180
+
181
+ # Reranking
182
+ docs_to_rerank = [self.docs[doc_id]["data"] for doc_id in doc_ids]
183
+ yaml_docs = [yaml.dump(doc, sort_keys=False) for doc in docs_to_rerank]
184
+ rerank_results = co_rerank.rerank(
185
+ query=query,
186
+ documents=yaml_docs,
187
+ model="rerank-v3.5", # Pass a dummy string
188
+ top_n=self.rerank_top_k
189
+ )
190
+
191
+ doc_ids_reranked = [doc_ids[result.index] for result in rerank_results.results]
192
+
193
+ docs_retrieved = []
194
+ for doc_id in doc_ids_reranked:
195
+ docs_retrieved.append(self.docs[doc_id]["data"])
196
 
 
197
  return docs_retrieved
198
 
199
  if not vectored == "vectored":
 
207
  # Gradioの関数
208
  def search(query):
209
  results = vectorstore.retrieve(query)
210
+ decoded_data = decode_text(results)
211
+ print("decoded_data:",decoded_data)
212
  return "\n\n".join([f"**Title**: {r['title']}\n**Text**: {r['text']}\n**URL**: {r['url']}" for r in decoded_data])
213
 
214
  # Gradioインターフェース