# NOTE: this file was recovered from a Hugging Face "Spaces: Runtime error" log;
# the table formatting (leading/trailing pipes) from that paste has been stripped.
from sentence_transformers import SentenceTransformer, util
import torch
import difflib
from utils.GetDB import GetDB

# Module-level singletons, created once at import time:
# a PostgreSQL connection pool obtained from the project DB helper...
postgreSQL_pool = GetDB().get_db_connection()
# ...and the sentence-embedding model used for every semantic-similarity
# comparison in this module.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
def get_question():
    """Fetch every stored chat question, newest first.

    Returns:
        list[str]: question texts from the ``chat_history`` table.
    """
    # Borrow a connection from the module-level pool.
    conn = postgreSQL_pool.getconn()
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT question FROM chat_history ORDER BY created_at DESC")
            # fetchall() yields 1-tuples; unwrap them to plain strings.
            results = [row[0] for row in cur.fetchall()]
        finally:
            # Bug fix: the cursor close was commented out, leaking a cursor on
            # every call.
            cur.close()
    finally:
        # Always hand the connection back to the pool, even if the query fails.
        postgreSQL_pool.putconn(conn)
    return results
def count_top_questions(questions_array):
    """Count, for each question, how many OTHER questions are near-duplicates.

    Embeds the whole corpus once, then for every question counts how many of
    its nearest neighbours (excluding itself) have cosine similarity >= 0.8.

    Args:
        questions_array: list of question strings.

    Returns:
        list[tuple[str, int]]: up to 50 (question, neighbour_count) pairs,
        sorted by count descending.
    """
    if not questions_array:
        return []
    corpus_embeddings = embedder.encode(questions_array, convert_to_tensor=True)
    # Bug fix: torch.topk raises a RuntimeError when k exceeds the tensor
    # size, which happened whenever fewer than 100 questions were stored.
    k = min(100, len(questions_array))
    top_questions_array = {}
    for question in questions_array:
        query_embedding = embedder.encode([question], convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        scores, _indices = torch.topk(cos_scores, k=k)
        # Skip the first hit: a question always matches itself with score ~1.0.
        top_questions_array[question] = sum(
            1 for score in scores[1:] if score.item() >= 0.8
        )
    return sorted(top_questions_array.items(), key=lambda x: x[1], reverse=True)[:50]
def remove_redundancy(redundant_raw_top_asked_questions):
    """Drop near-duplicate questions by textual similarity, keeping the first.

    Two entries are considered duplicates when their
    ``difflib.SequenceMatcher`` ratio falls in [0.7, 1.0) — exactly-equal
    strings (ratio 1.0) are deliberately NOT treated as duplicates, matching
    the original behavior.

    Bug fix: the previous version called ``list.remove`` on the very list
    both loops were iterating, which made the iterators skip elements and
    produced unpredictable results.

    Args:
        redundant_raw_top_asked_questions: list of (question, count) tuples.

    Returns:
        The same list object, de-duplicated in place.
    """
    deduped = []
    for candidate in redundant_raw_top_asked_questions:
        is_duplicate = False
        for kept in deduped:
            ratio = difflib.SequenceMatcher(None, candidate[0], kept[0]).ratio()
            if 0.7 <= ratio < 1.0:
                is_duplicate = True
                break
        if not is_duplicate:
            deduped.append(candidate)
    # Mutate in place so callers holding a reference see the filtered list.
    redundant_raw_top_asked_questions[:] = deduped
    return redundant_raw_top_asked_questions
def remove_greetings(sanitised_questions_array):
    """Strip greeting/small-talk entries from the head of the ranked list.

    Each of the first ten (question, count) pairs is embedded and compared
    against a bank of known greeting phrases; any entry whose best cosine
    similarity reaches 0.87 is removed from the list in place.
    """
    greeting_array = [
        'hey', 'hi', 'hello', "Hello!",
        "Hi there!",
        "Hey!",
        "Good morning!",
        "Good afternoon!",
        "Good evening!",
        "Howdy!",
        "Greetings!",
        "Nice to see you!",
        "What's up?",
        "Hi!",
        "hiiii",
        "Hello!",
        "Hey!", "How are you?",
        "What is your name?",
        "Where are you from?",
        "What do you do?",
        "How can I help you?",
        "What's the weather like?",
        "Do you have any plans for the weekend?",
        "Have you seen any good movies lately?",
        "What's your favorite food?",
        "What are your hobbies?", "hi, hello",
    ]
    greetings_embeddings = embedder.encode(greeting_array, convert_to_tensor=True)
    # Slicing produces a copy, so removing from the original list while
    # looping over the slice is safe.
    for candidate in sanitised_questions_array[:10]:
        candidate_embedding = embedder.encode([candidate[0]], convert_to_tensor=True)
        similarities = util.cos_sim(candidate_embedding, greetings_embeddings)[0]
        # The single best greeting match decides whether this entry is dropped.
        if torch.max(similarities).item() >= 0.87:
            sanitised_questions_array.remove(candidate)
    return sanitised_questions_array
def final_phase_filtering(raw_first_phase_filtered_questions, limit=20):
    """Final semantic de-duplication pass over the top-ranked questions.

    Keeps the first occurrence of each semantic cluster among the first
    ``limit`` entries: a candidate is dropped when its embedding is >= 0.85
    cosine-similar to an entry already kept.

    Bug fixes vs. the previous version:
      * no longer removes items from the list while iterating it (the
        iterators skipped elements unpredictably);
      * no longer compares an item against itself — self-similarity (~1.0)
        fell inside the [0.85, 1+eps) removal window and deleted valid
        questions;
      * embeds each question once instead of re-encoding inside the nested
        loop (O(n) encodes instead of O(n^2)).

    Args:
        raw_first_phase_filtered_questions: list of (question, count) tuples.
        limit: how many leading entries to consider.

    Returns:
        list[tuple]: the de-duplicated entries, original order preserved.
    """
    candidates = raw_first_phase_filtered_questions[:limit]
    if not candidates:
        return candidates
    embeddings = embedder.encode([c[0] for c in candidates], convert_to_tensor=True)
    kept = []
    kept_indices = []
    for i, candidate in enumerate(candidates):
        is_duplicate = any(
            util.cos_sim(embeddings[i], embeddings[j]).item() >= 0.85
            for j in kept_indices
        )
        if not is_duplicate:
            kept.append(candidate)
            kept_indices.append(i)
    return kept
def return_top_question(limit=5):
    """Build a human-readable summary of the most-asked questions.

    Pipeline: fetch all questions from the DB -> count semantic neighbours ->
    drop textual near-duplicates -> drop greetings -> final embedding-based
    de-duplication -> format the top ``limit`` as a numbered list.

    Args:
        limit: maximum number of questions to include in the message.

    Returns:
        str: the formatted summary message.
    """
    # Bug fix: removed leftover debug prints that wrote literal variable
    # names ('questions', 'count_top_questions_', ...) to stdout.
    questions = get_question()
    ranked = count_top_questions(questions)
    deduplicated = remove_redundancy(ranked)
    without_greetings = remove_greetings(deduplicated)
    top_entries = final_phase_filtering(without_greetings)[:limit]
    message = 'These are the top questions asked on the ask twimbit/platform by the users:'
    for key, entry in enumerate(top_entries):
        message = message + '\n {}: '.format(key + 1) + entry[0]
    return message
def return_recent_posts(limit=5, strategy='recent', timeout=30):
    """Fetch recent posts from the Hasura feed endpoint and format them.

    Args:
        limit: maximum number of posts to request.
        strategy: feed strategy passed to the GraphQL query.
        timeout: request timeout in seconds (new, backward-compatible).

    Returns:
        str: a formatted listing of posts; the bare header line if the
        request does not return HTTP 200.

    Raises:
        KeyError: if HASURA_URL / HASURA_ADMIN_SECRET are not set.
        requests.exceptions.RequestException: on network failure/timeout.
    """
    import os
    import requests
    import json
    HASURA_URL = os.environ['HASURA_URL']
    HASURA_ADMIN_SECRET = os.environ['HASURA_ADMIN_SECRET']
    body = """query homeFeedQuery($strategy: Strategy, $limit: Int){
        feed(strategy: $strategy, limit: $limit) {
            hits {
                link
                title
                date
                author
            }
        }
    }"""
    variables = {'strategy': strategy, 'limit': limit}
    # Bug fix: the request previously had no timeout and could hang forever.
    response = requests.post(
        url=HASURA_URL,
        json={'query': body, 'variables': variables},
        headers={'x-hasura-admin-secret': HASURA_ADMIN_SECRET},
        timeout=timeout,
    )
    message = 'These are the recent Articles/Posts on the platform/twimbit website: \n'
    if response.status_code == 200:
        data = json.loads(response.content)
        posts = data.get('data').get('feed').get('hits')
        for key, post in enumerate(posts):
            title = post.get('title')
            link = post.get('link')
            date = post.get('date')
            # Guard against a missing author field; 'author' is presumably a
            # list of names — TODO confirm against the Hasura schema.
            authors = ','.join(post.get('author') or [])
            message += 'Post/Article {}:- \n\tPost/Article Title:- {}\n\tPost/Article Link/URL:- {}\n\tPost/Article Publish Date:- {}\n\tPost/Article Author:- {}\n'.format(
                key + 1, title, link, date, authors)
    return message