Rams901 committed on
Commit
c867eb4
·
1 Parent(s): d56f213

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -10
app.py CHANGED
@@ -44,14 +44,30 @@ def add_text(history, text):
44
 
45
  def retrieve_thoughts(query, ):
46
  # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
47
- docs = db.similarity_search_with_score(query = query, k = 1500, fetch_k = len(db.index_to_docstore_id.values()))
48
-
 
 
 
49
  # TO-DO: What if user query doesn't match what we provide as documents
 
 
 
 
 
 
 
50
 
51
- tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
52
- tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
 
53
 
54
- return {'tier 1':tier_1, 'tier 2': tier_2}
 
 
 
 
 
55
 
56
  def qa_retrieve(query,):
57
 
@@ -72,11 +88,10 @@ def qa_retrieve(query,):
72
  tier_1 = thoughts['tier 1']
73
  tier_2 = thoughts['tier 2']
74
 
75
- reference = [{'id': f'{i+1}','website': extract_website_name(thought.metadata['url']), 'url': thought.metadata['url'], } for i, thought in enumerate(tier_1)]
76
-
77
- tier_1 = [f"[{i+1}] title: {thought.metadata['title']}\n Content: {thought.page_content}" for i, thought in enumerate(tier_1)]
78
- tier_2 = [f"title: {thought.metadata['title']}\n Content: {thought.page_content}" for thought in tier_2]
79
-
80
  print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER2: {tier_2}")
81
  # print(f"DOCS RETRIEVED: {mp_docs.values}")
82
 
 
44
 
45
  def retrieve_thoughts(query, ):
46
  # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
47
+ docs_with_score = db.similarity_search_with_score(query = query, k = 1500, fetch_k = len(db.index_to_docstore_id.values()))
48
+ df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score], )
49
+ df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
50
+ df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
51
+
52
  # TO-DO: What if user query doesn't match what we provide as documents
53
+
54
+ tier_1 = df[df['score'] < 0.7]
55
+ tier_2 = df[(df['score'] < 0.95) * (df["score"] > 0.7)]
56
+
57
+ chunks_1 = tier_1.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
58
+ tier_1_adjusted = tier_1.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
59
+ tier_1_adjusted['content'] = chunks_1
60
 
61
+ chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
62
+ tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
63
+ tier_2_adjusted['content'] = chunks_2
64
 
65
+
66
+
67
+ # tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
68
+ # tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
69
+
70
+ return {'tier 1':tier_1_adjusted.loc[:5], 'tier 2': tier_2.loc[:5]}
71
 
72
  def qa_retrieve(query,):
73
 
 
88
  tier_1 = thoughts['tier 1']
89
  tier_2 = thoughts['tier 2']
90
 
91
+ reference = tier_1_adjusted[['ref', 'url', 'title']].to_dict('records')
92
+
93
+ tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])+1}] title: {x['title']}\n Content: {x.content}", axis = 1).values)
94
+ tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis = 1).values)
 
95
  print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER2: {tier_2}")
96
  # print(f"DOCS RETRIEVED: {mp_docs.values}")
97