# nextAnalytics / reddit / reddit_utils.py
# honey234's picture
# updated backend
# 3d00f61
import time
import pandas as pd
from sentence_transformers import SentenceTransformer
# Sentence-embedding model instance shared by the functions in this module;
# it is constructed once, when the module is imported.
# NOTE(review): cache_folder is an absolute path inside one Windows user
# profile (C:\Users\HP) -- confirm it is valid on the machine running this.
sentence_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    cache_folder=r"C:\Users\HP\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2",
)
def get_microseconds_list(length=3):
    """Return ``length`` consecutive integers starting at the current time.

    The first value is the current Unix time expressed in whole microseconds;
    each following value is one larger than the previous one.
    """
    start = int(time.time() * 1_000_000)
    return list(range(start, start + length))
def topic_sort(path1, query, path2='', path3='', isForCompetitorAnalysis=False):
    """Rank scraped posts by title similarity to ``query``, then by engagement.

    The CSV file(s) are loaded into one frame, every title is embedded together
    with ``query`` by the module-level ``sentence_model``, and the rows are
    narrowed in two passes: the 30 titles most similar to the query are kept,
    and of those the 18 with the highest ``comment_count`` (ties ordered by
    ``votes_count``) are returned.

    Args:
        path1: Path of the first (or only) CSV file.
        query: Text the post titles are compared against.
        path2: Optional second CSV file; skipped when empty.
        path3: Optional third CSV file; skipped when empty.
        isForCompetitorAnalysis: When true, only ``path1`` is read.

    Returns:
        A DataFrame of at most 18 rows, re-indexed from 0, with an added
        ``similarity`` column. The input must provide the ``title``,
        ``comment_count`` and ``votes_count`` columns.
    """
    if isForCompetitorAnalysis:
        csv_paths = [path1]
    else:
        # path2/path3 default to '' and pd.read_csv('') raises, so only the
        # paths that were actually supplied are read.
        csv_paths = [path1] + [p for p in (path2, path3) if p]

    df = pd.concat([pd.read_csv(p) for p in csv_paths], axis=0)

    # NOTE(review): the original indentation of this file was lost, so it is
    # unclear whether this clean-up belonged to the multi-file branch only.
    # It is applied to both modes here -- confirm against the callers.
    df = df.drop_duplicates("title")
    df = df.reset_index(drop=True)
    # Some CSVs carry a saved "index" column; tolerate files that do not.
    df = df.drop("index", axis=1, errors="ignore")

    # The query is embedded as sentence 0 so that a single similarity call
    # compares it with every title.
    sentences = [query] + list(df["title"])
    embeddings = sentence_model.encode(sentences)
    similarities = sentence_model.similarity(embeddings[0], embeddings)
    # Entry 0 is the query compared with itself, so it is skipped.
    df["similarity"] = similarities[0][1:]

    # Pass 1: keep the 30 titles closest to the query.
    df = df.sort_values(by='similarity', ascending=False)
    df = df.reset_index(drop=True)
    df = df.head(30)

    # Pass 2: of those, keep the 18 most discussed / most voted posts.
    df = df.sort_values(by=['comment_count', 'votes_count'], ascending=False)
    df = df.reset_index(drop=True)
    df = df.head(18)
    return df
# Names of the Reddit services, in their listed order.
reddit_services_names = [
    'Pain point analysis',
    'Competitor analysis',
]