File size: 4,251 Bytes
b2b5a46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35e8902
 
 
 
 
 
 
 
 
 
 
b2b5a46
 
 
 
 
 
 
 
 
 
 
35e8902
 
b2b5a46
35e8902
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2b5a46
 
 
 
 
 
 
 
 
 
 
 
 
35e8902
 
b2b5a46
35e8902
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import pandas as pd
import tiktoken
import pandas as pd
import time
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import nltk
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import gzip
import os
import torch
import re

from openai.embeddings_utils import get_embedding, cosine_similarity
import os



# Load the article table from disk. Code further down in this file reads its
# `content`, `summary_html`, `embedding`, `title`, `url` and `keywords` columns.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
df = pd.read_pickle('entire_data.pkl') # restore the pickled DataFrame into `df`
# Sentence-embedding model; `search` uses it to encode the user's query.
model = SentenceTransformer('all-mpnet-base-v2')

# Matches a single HTML/XML tag. `[^>]` (unlike `.`) also matches newlines, so
# tags whose attributes wrap across several lines are removed as well.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def remove_html_tags(text):
    """Return *text* with every ``<...>`` tag removed.

    Only the markup itself is dropped: the text between tags is kept as-is and
    HTML entities (e.g. ``&amp;``) are not decoded.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The input string without its tags.
    """
    return _HTML_TAG_RE.sub('', text)

# Strip markup from the article body and summary once at start-up so that the
# rest of the script works with plain text.
df['content'] = df['content'].apply(remove_html_tags)
df['summary_html'] = df['summary_html'].apply(remove_html_tags)

# System message sent with every chat request made by `new_ask`.
session_prompt = (
    " A bot that is open to discussions about different cultural, philosophical and political exchanges."
    " I will use do different analysis to the articles provided to me."
    " Stay truthful and if you weren't provided any resources give your oppinion only."
)

def new_ask(user_input):
    """Send *user_input* to the chat model and return the text of its reply.

    The request is a two-message conversation: ``session_prompt`` as the
    system message followed by *user_input* as the user message, requested
    with ``temperature=0``.

    Args:
        user_input: Prompt text sent as the user message.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    # The file-level import is ``from openai.embeddings_utils import ...``,
    # which does not bind the name ``openai`` in this module. Import it here so
    # the call below does not fail with NameError.
    import openai

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {'role': 'system', 'content': session_prompt},
            {'role': 'user', 'content': user_input},
        ],
        temperature=0,
    )
    return response['choices'][0]['message']['content']

def search(query, n=10, tier_1_threshold=0.5, tier_2_threshold=0.4):
    """Answer *query* using the articles in ``df`` that are most similar to it.

    The query is embedded with ``model`` and compared against every row's
    ``embedding`` with ``cosine_similarity``. The ``n`` best rows are collapsed
    to one entry per (title, url, keywords, summary_html) group, keeping the
    highest score of each group, and split into two tiers by score. Tier-1
    titles and summaries are passed to ``new_ask`` to produce the answer; when
    tier 2 is not empty its summaries are passed to ``new_ask`` to produce
    related questions, which are appended to the answer.

    Args:
        query: Free-text question from the user.
        n: Number of top-scoring rows kept before grouping.
        tier_1_threshold: Entries scoring strictly above this go to tier 1.
        tier_2_threshold: Remaining entries scoring strictly above this go to
            tier 2; anything else is discarded.

    Returns:
        The answer text, followed by a "Related Questions" section when at
        least one tier-2 entry exists.
    """
    # Column vector of whatever width the model produces (the width used to be
    # hard-coded as 768). Reshaped once instead of once per row.
    query_embedding = model.encode(query).reshape(-1, 1)

    # Score into a copy rather than writing a column onto the shared
    # module-level frame, so overlapping requests cannot overwrite each
    # other's scores.
    scored = df.assign(
        similarity=df.embedding.apply(
            lambda emb: cosine_similarity(emb, query_embedding)
        )
    )
    top_rows = scored.sort_values("similarity", ascending=False).head(n)

    # Several rows can describe the same article; keep one entry per article
    # with its best score, best first.
    best_per_article = top_rows.groupby(
        ['title', 'url', 'keywords', 'summary_html']
    ).similarity.max()
    results = (
        pd.DataFrame(best_per_article)
        .reset_index()
        .sort_values("similarity", ascending=False)
    )

    tier_1 = []
    tier_2 = []
    for row in results.itertuples(index=False):
        # Each score is stored in an indexable one-element container
        # (presumably a side effect of the column-vector reshape above).
        score = row.similarity[0]
        if score > tier_1_threshold:
            tier = tier_1
        elif score > tier_2_threshold:
            tier = tier_2
        else:
            continue
        tier.append(
            {
                "title": row.title,
                "url": row.url,
                "score": str(score),
                "summary": row.summary_html[:200],
                "keywords": row.keywords,
            }
        )

    # Debug output of the retrieved articles.
    print(tier_1)
    print(tier_2)
    print("tier 1:\n" + "\n".join(entry['title'] for entry in tier_1))

    articles = "\n".join(
        entry['title'] + ': ' + entry['summary'] for entry in tier_1
    )
    answer = new_ask(
        "Answer the following query by giving arguments from the different arguments provided below. "
        "Make sure to quote the article used if the argument corrseponds to the query: "
        f"Query: {query} Articles {articles}\n"
        "Use careful reasoning to explain your answer and give your conclusion about this."
    )

    if not tier_2:
        return f"{answer}"

    summaries = "\n".join(
        str(i) + ' ' + entry['summary'] for i, entry in enumerate(tier_2)
    )
    related_questions = new_ask(
        f"Give general questions related the following articles: {summaries}"
    )
    return f"{answer}\n\nRelated Questions:\n{related_questions}"

def greet(query):
    """Gradio callback: return the text produced by ``search`` for *query*."""
    return search(query)

# Sample queries offered as one-click examples in the UI.
examples = [
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech  vs Freedom of Speech"],
    ["The importance of values and reflection"],
]

# `gr.inputs.Textbox` belongs to the deprecated legacy `gradio.inputs`
# namespace; the top-level `gr.Textbox` component takes the same `lines` and
# `label` arguments.
demo = gr.Interface(
    fn=greet,
    title="cicero-interactive-qa",
    inputs=gr.Textbox(lines=5, label="what would you like to learn about?"),
    outputs="text",
    examples=examples,
)

# share=True requests a public link; debug=True keeps the process attached and
# prints errors to the console.
demo.launch(share=True, debug=True)