Cicero-QA-api-dev

Runtime error

App Files Files Community

Rams901 committed on Aug 17, 2023

Commit

c867eb4

1 Parent(s): d56f213

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -10

app.py CHANGED Viewed

@@ -44,14 +44,30 @@ def add_text(history, text):
 def retrieve_thoughts(query, ):
   # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
-  docs = db.similarity_search_with_score(query = query, k = 1500, fetch_k = len(db.index_to_docstore_id.values()))
   # TO-DO: What if user query doesn't match what we provide as documents
-  tier_1 = [doc[0]  for doc in docs if ((doc[1] < 1))][:5]
-  tier_2 = [doc[0]  for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
-  return {'tier 1':tier_1, 'tier 2': tier_2}
 def qa_retrieve(query,):
@@ -72,11 +88,10 @@ def qa_retrieve(query,):
     tier_1 = thoughts['tier 1']
     tier_2 = thoughts['tier 2']
-    reference = [{'id': f'{i+1}','website': extract_website_name(thought.metadata['url']), 'url': thought.metadata['url'], } for i, thought in enumerate(tier_1)]
-    tier_1 = [f"[{i+1}] title: {thought.metadata['title']}\n Content: {thought.page_content}" for i, thought in enumerate(tier_1)]
-    tier_2 = [f"title: {thought.metadata['title']}\n Content: {thought.page_content}" for thought in tier_2]
     print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER2: {tier_2}")
     # print(f"DOCS RETRIEVED: {mp_docs.values}")

 def retrieve_thoughts(query, ):
     """Retrieve document chunks for ``query`` and group them into two tiers.

     Every chunk returned by ``db.similarity_search_with_score`` is placed in
     a DataFrame together with its metadata, text and score. Chunks are split
     into tiers by score, and chunks that belong to the same document (same
     ``title``, ``url`` and ``_id``) are merged into a single row.

     Args:
         query: The user query passed to the similarity search.

     Returns:
         A dict with the keys ``'tier 1'`` and ``'tier 2'``. Each value is a
         DataFrame with the columns ``_id``, ``title``, ``url``, ``content``
         and ``ref`` holding at most five documents. ``ref`` is a 0-based
         position within the tier. A tier with no matching chunk is an empty
         DataFrame with the same columns.
     """
     # NOTE(review): the thresholds treat a lower score as a closer match
     # (presumably a distance) -- confirm against the vector store in use.
     tier_1_max = 0.7
     tier_2_max = 0.95

     docs_with_score = db.similarity_search_with_score(
         query=query,
         k=1500,
         fetch_k=len(db.index_to_docstore_id.values()),
     )

     # One row per chunk: metadata columns plus the chunk text and its score.
     df = pd.DataFrame([dict(doc)['metadata'] for doc, _ in docs_with_score])
     df['page_content'] = [dict(doc)['page_content'] for doc, _ in docs_with_score]
     df['score'] = [score for _, score in docs_with_score]

     # Combine masks with `&`; `>=` keeps a score of exactly 0.7 in tier 2
     # instead of dropping it from both tiers.
     tier_1 = df[df['score'] < tier_1_max]
     tier_2 = df[(df['score'] >= tier_1_max) & (df['score'] < tier_2_max)]

     # Both tiers are returned in the merged per-document form, because the
     # caller reads the `content` column from each of them.
     return {
         'tier 1': _collapse_chunks(tier_1),
         'tier 2': _collapse_chunks(tier_2),
     }


 def _collapse_chunks(chunks, limit=5):
     """Merge the chunk rows of each document into one row, best score first."""
     columns = ['_id', 'title', 'url', 'content', 'ref']
     # An empty tier may lack the metadata columns entirely, so grouping it
     # would raise; return an empty frame with the expected columns instead.
     if chunks.empty:
         return pd.DataFrame(columns=columns)

     rows = []
     for (title, url, doc_id), group in chunks.groupby(['title', 'url', '_id'], sort=False):
         # `id` is presumably the chunk's position inside its document;
         # sorting by it keeps the joined text in reading order.
         ordered = group.sort_values('id')
         rows.append({
             '_id': doc_id,
             'title': title,
             'url': url,
             'content': "\n...\n".join(ordered['page_content'].values),
             'score': group['score'].min(),
         })
     if not rows:
         return pd.DataFrame(columns=columns)

     # Rank documents by their best chunk, then keep exactly `limit` rows.
     # `head` is positional, unlike `.loc[:5]`, which slices by label and
     # includes the end label.
     merged = (
         pd.DataFrame(rows)
         .sort_values('score', kind='mergesort')
         .head(limit)
         .reset_index(drop=True)
     )
     # 0-based reference number; the caller renders it as `ref + 1`.
     merged['ref'] = range(len(merged))
     return merged[columns]
 def qa_retrieve(query,):
     tier_1 = thoughts['tier 1']
     tier_2 = thoughts['tier 2']
+    reference = tier_1_adjusted[['ref', 'url', 'title']].to_dict('records')
+    tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])+1}] title: {x['title']}\n Content: {x.content}", axis = 1).values)
+    tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis = 1).values)
     print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER2: {tier_2}")
     # print(f"DOCS RETRIEVED: {mp_docs.values}")