"""Gradio question-answering demo over a pickled article DataFrame.

A query is embedded with a SentenceTransformer model, compared against the
pre-computed ``embedding`` column of the DataFrame, and the best-matching
articles are passed to ``gpt-3.5-turbo`` to produce an answer and, when
weaker matches exist, a list of related questions.
"""
import gzip
import os
import re
import time
from collections import Counter
from heapq import nlargest
from string import punctuation

import gradio as gr
import nltk
import numpy as np
import openai
import pandas as pd
import spacy
import tiktoken
import torch
from openai.embeddings_utils import cosine_similarity, get_embedding
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

# Number of most-similar rows considered per query.
TOP_N = 10
# Similarity above this goes to tier 1 (used to answer the query).
TIER_1_THRESHOLD = 0.5
# Similarity above this (but not tier 1) goes to tier 2 (related questions).
TIER_2_THRESHOLD = 0.4
# Number of summary characters forwarded to the chat model per article.
SUMMARY_CHARS = 200

# Columns that identify an article; also the group-by key when de-duplicating.
_RESULT_COLUMNS = ['title', 'url', 'keywords', 'summary_html']

# Compiled once: non-greedy match of anything that looks like an HTML tag.
_HTML_TAG_RE = re.compile('<.*?>')

# NOTE(review): pickle executes arbitrary code on load; 'entire_data.pkl'
# must come from a trusted source.
df = pd.read_pickle('entire_data.pkl')
model = SentenceTransformer('all-mpnet-base-v2')


def remove_html_tags(text: str) -> str:
    """Return *text* with every ``<...>`` tag removed."""
    return _HTML_TAG_RE.sub('', text)


df['content'] = df.content.apply(remove_html_tags)
df['summary_html'] = df.summary_html.apply(remove_html_tags)

session_prompt = """ A bot that is open to discussions about different cultural, philosophical and political exchanges. I will use do different analysis to the articles provided to me. 
Stay truthful and if you weren't provided any resources give your oppinion only."""


def new_ask(user_input: str) -> str:
    """Send *user_input* to gpt-3.5-turbo and return the reply text.

    ``session_prompt`` is sent as the system message; temperature is 0.

    NOTE(review): no API key is configured in this file; presumably it is
    supplied through the environment -- confirm against the deployment.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {'role': 'system', 'content': session_prompt},
            {'role': 'user', 'content': user_input},
        ],
        temperature=0,
    )
    return response['choices'][0]['message']['content']


def _as_article(row) -> dict:
    """Convert one ranked result row into the dict used to build prompts."""
    return {
        "title": row.title,
        "url": row.url,
        "score": str(row.similarity),
        "summary": row.summary_html[:SUMMARY_CHARS],
        "keywords": row.keywords,
    }


def _rank_articles(query: str) -> pd.DataFrame:
    """Return the de-duplicated top matches for *query*, best first.

    The result has the columns in ``_RESULT_COLUMNS`` plus a scalar
    ``similarity`` column. The global ``df`` is only read, never modified,
    so concurrent requests cannot overwrite each other's scores.
    """
    # Flatten both vectors so the cosine similarity is a plain scalar.
    query_vector = np.asarray(model.encode(query)).ravel()
    similarities = df.embedding.apply(
        lambda emb: float(
            cosine_similarity(np.asarray(emb).ravel(), query_vector)
        )
    )
    # Attach the scores to a per-request copy instead of the shared frame.
    candidates = df[_RESULT_COLUMNS].assign(
        similarity=similarities.to_numpy()
    )
    top = candidates.sort_values("similarity", ascending=False).head(TOP_N)
    # Several rows can belong to the same article; keep its best score.
    best_per_article = top.groupby(_RESULT_COLUMNS).similarity.max()
    return best_per_article.reset_index().sort_values(
        "similarity", ascending=False
    )


def search(query: str) -> str:
    """Answer *query* from the most similar articles.

    Articles scoring above ``TIER_1_THRESHOLD`` are given to the chat model
    as evidence for the answer. Articles scoring above ``TIER_2_THRESHOLD``
    (but not in tier 1) are used to generate related questions, which are
    appended to the answer when any exist.

    Returns:
        The model's answer, optionally followed by a "Related Questions"
        section.
    """
    tier_1 = []
    tier_2 = []
    for row in _rank_articles(query).itertuples(index=False):
        if row.similarity > TIER_1_THRESHOLD:
            tier_1.append(_as_article(row))
        elif row.similarity > TIER_2_THRESHOLD:
            tier_2.append(_as_article(row))

    # Diagnostic output for the server log.
    print(tier_1)
    print(tier_2)
    ln = "\n"
    prefix = f"tier 1:\n{ln.join([x['title'] for x in tier_1])}"
    print(prefix)

    articles = ln.join([x['title'] + ': ' + x['summary'] for x in tier_1])
    answer = new_ask(
        "Answer the following query by giving arguments from the different "
        "arguments provided below. \n"
        "Make sure to quote the article used if the argument corrseponds to "
        f"the query: Query: {query} Articles {articles}\n"
        "Use careful reasoning to explain your answer and give your "
        "conclusion about this."
    )

    if tier_2:
        numbered = ln.join(
            [str(i) + ' ' + x['summary'] for i, x in enumerate(tier_2)]
        )
        related_questions = new_ask(
            "Give general questions related the following articles: "
            f"{numbered}"
        )
        return f"{answer}\n\nRelated Questions:\n{related_questions}"
    return f"{answer}"


def greet(query: str) -> str:
    """Gradio callback: return the answer text produced by ``search``."""
    return search(query)


examples = [
    ["Climate Change Challenges in Europe"],
    ["Philosophy in the world of Minimalism"],
    ["Hate Speech vs Freedom of Speech"],
    ["The importance of values and reflection"],
]

demo = gr.Interface(
    fn=greet,
    title="cicero-interactive-qa",
    outputs="text",
    inputs=gr.inputs.Textbox(
        lines=5, label="what would you like to learn about?"
    ),
    examples=examples,
)

demo.launch(share=True, debug=True)