# NOTE(review): the seven lines below are Hugging Face Hub page furniture
# (file path, author, commit message "bf4f857", raw/history links, file size)
# that was pasted into the source. As bare text they break the module — the
# stray apostrophe in line 2 is a SyntaxError — so they are kept as comments.
# nextAnalytics / test.py
# honeybansal23's picture
# fixed code bugs largely
# bf4f857
# raw
# history blame
# 14 kB
import asyncio
import asyncpraw
import pandas as pd
import json
import ast
from reddit.reddit_sentiment_analysis import SentimentAnalysis
from reddit.scraping import getPostComments
# Load the CSV file
# df = pd.read_csv('posts_data_1732297356855788.csv')
# # Convert the 'comments' column to a list
# comments_list = []
# for i in df['comments']:
# # json_data= ast.literal_eval(i)
# comments_list.append(i)
# print(comments_list[1][0:100])
# print(df['comments'])
# # # print("comments_list",len())
# # # Recursive function to count non-empty replies
# # def count_non_empty_replies(comments):
# # count = 1
# # for comment in comments:
# # if comment.get("replies"):
# # count += 1 # Increment for this non-empty replies list
# # count += count_non_empty_replies(comment["replies"]) # Recursively count nested replies
# # return count
# # # Example usage
# # # total_non_empty_replies = count_non_empty_replies(comments_list[0]['comments'][0]['replies'])
# # # print("Total non-empty replies:", total_non_empty_replies)
# # # Sample data structure
# # # comments_list = [
# # # {
# # # "comments": [
# # # # Each post contains a list of comments with nested replies
# # # ]
# # # }
# # # # More comments_list here
# # # ]
# # # Recursive function to limit replies in a comment tree
# # def limit_replies(comment, reply_limit=3):
# # limited_comment = {
# # "user": comment["user"],
# # "comment": comment["comment"],
# # "replies": []
# # }
# # if reply_limit == 0:
# # return limited_comment
# # # Get up to reply_limit replies, recursively applying the function
# # if "replies" in comment:
# # for reply in comment["replies"][:reply_limit-1]:
# # limited_comment["replies"].append(limit_replies(reply, reply_limit-2))
# # return limited_comment
# # # Function to process each post, extracting 10 comments with limited replies
# # def get_limited_comments(comments_list, comment_limit=10, reply_limit=7):
# # limited_comments_list = []
# # for post in comments_list:
# # limited_post = {"comments": []}
# # # Get up to comment_limit comments for each post
# # for comment in post["comments"][:comment_limit]:
# # limited_comment = limit_replies(comment, reply_limit)
# # limited_post["comments"].append(limited_comment)
# # limited_comments_list.append(limited_post)
# # return limited_comments_list
# # Example usage
# # limited_comments_data = get_limited_comments(comments_list)
# # total_non_empty_replies = count_non_empty_replies(limited_comments_data[0]['comments'][0]['replies'])
# # Save the list to a JSON file
# # with open('comments2.json', 'w') as json_file:
# # json.dump(comments_list, json_file,indent=4)
# # # from reddit.scraping import getPostComments
# # # getPostComments(fileName="posts_data_1732105228633815.csv")
# # import time
# # from reddit.reddit_sentiment_analysis import SentimentAnalysis
# # # Create an instance of the SentimentAnalysis class
# # sentiment_instance = SentimentAnalysis()
# # # Record the start time
# # start_time = time.time()
# # # Call the method to generate sentiment and emotion
# # sentiment_instance.generate_sentiment_and_emotion_from_data(fileName='posts_data.csv')
# # # Record the end time
# # end_time = time.time()
# # # Calculate and print the processing time
# # process_time = end_time - start_time
# # print(f"Processing time: {process_time:.2f} seconds")
# # from reddit.reddit_pain_point_analysis import pain_point_analysis
# # pain_point_analysis(user_query="artificial intelligence applications in skincare and cosmetic industry",fileName="file_with_sentiment.csv")
# # import google.generativeai as genai
# # SECURITY(review): a real-looking Google Generative AI key is hard-coded
# # below. Even commented out it is leaked through version control history —
# # revoke/rotate it and load keys from an environment variable instead.
# # genai.configure(api_key='AIzaSyBtHE4Bg2ERWsKeGLxGPOSmtZeWRD6nNr0')
# # model = genai.GenerativeModel("gemini-1.5-flash")
# # generation_config = genai.GenerationConfig(response_mime_type="application/json")
# # response = model.generate_content("skin care ai ", generation_config=generation_config) # Adjust if the library supports async
# # data = response.text
# # print(data)
# '''
# Only Scraping related code.
# '''
# from selenium import webdriver
# from selenium.webdriver.common.action_chains import ActionChains
# from selenium.webdriver.common.by import By
# from selenium.webdriver.firefox.options import Options as FirefoxOptions
# import time
# from fake_headers import Headers
# import pandas as pd
# import praw
# # from reddit_call import sentence_model
# import re
# # # Set up WebDriver
# header = Headers().generate()["User-Agent"]
# proxy=None
# browser_option = FirefoxOptions()
# browser_option.add_argument("--no-sandbox")
# browser_option.add_argument("--disable-dev-shm-usage")
# browser_option.add_argument("--ignore-certificate-errors")
# browser_option.add_argument("--disable-gpu")
# browser_option.add_argument("--log-level=3")
# browser_option.add_argument("--disable-notifications")
# browser_option.add_argument("--disable-popup-blocking")
# browser_option.add_argument("--user-agent={}".format(header))
# if proxy is not None:
# browser_option.add_argument("--proxy-server=%s" % proxy)
# # For Hiding Browser
# browser_option.add_argument("--headless")
# driver = webdriver.Firefox(options=browser_option)
# actions = ActionChains(driver)
# SECURITY(review): real Reddit client_id, client_secret, account username and
# password are committed here (and repeated verbatim twice further down in the
# asyncpraw sections). Commented-out or not, they are exposed in repo history —
# rotate these credentials and read them from environment variables / a secrets
# manager (e.g. praw.ini or os.environ) instead of literals.
# reddit = praw.Reddit(
# client_id="yjGfys3QZPpdCpNZl25Kig",
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
# password="&honeyB90",
# user_agent="Curious",
# username="Final-Difference7055",
# )
# fileName="posts_data_1732244765294548.csv"
# data= pd.DataFrame(pd.read_csv(fileName))
# data["comments"]=""
# for i in range(len(data)):
# # comment_data_sub=[]
# submission = reddit.submission(url=data.iloc[i]['url'])
# # Fetch and process comments
# submission.comments.replace_more(limit=2) # Use limit=0 to get all comments
# comments_data = []
# # Function to process a comment and its replies
# # Seed with top-level comments
# comment_queue = list(submission.comments)
# comment_count=0
# threshold=20
# while comment_queue:
# if(comment_count>=threshold):
# break
# comment = comment_queue.pop(0)
# comment_data = process_comment(comment,reply_limit=2 if isForCompetitorAnalysis else 3) # Process each comment
# comments_data.append(comment_data)
# comment_count+=1
# # Now, structure the data into the desired JSON format
# json_output = {
# "comments": comments_data
# }
# subset_data = data.iloc[i].copy()
# # Modify the subset
# subset_data['comments'] = json_output
# # Assign back if needed
# data.iloc[i] = subset_data
# # Remove rows where 'comments' is an empty string
# data = data[data['comments'] != ""]
# data["descriptions"] = data["url"].apply(getSearchPostDescription)
# data.to_csv(fileName, index=False)
# print("Data saved to",fileName)
# from reddit.reddit_competitor_analysis import getCompetitorNamesFromReddit
# from reddit.api_keys import api_key,api_key2, api_key3
# import google.generativeai as genai
# genai.configure(api_key=api_key3)
# getCompetitorNamesFromReddit(user_query='AI powered personalized skin care recommendations',isSolo=True,fileName='posts_data_1732244547776200.csv',last_chat_session=None)
# senti = SentimentAnalysis()
# senti.generate_sentiment_and_emotion_from_data(fileName='posts_data_1732297356855788.csv')
# import asyncio
# async def fetchComments():
# # async def fetchComments():
# await getPostComments(fileName='posts_data_1732377539625804.csv')
# # await asyncio.wait_for(asyncio.to_thread(getPostComments, fileName='posts_data_1732377539625804.csv'))
# # Run the fetchComments function
# if __name__ == "__main__":
# asyncio.run(fetchComments())
# import asyncio
# async def long_running_task():
# await asyncio.sleep(5) # Simulate a long-running task
# print("hello world")
# async def main():
# try:
# await asyncio.wait_for(long_running_task(), timeout=500) # Set timeout to 2 seconds
# except asyncio.TimeoutError:
# print("The task timed out!")
# asyncio.run(main())
# async def fetch_comments():
# reddit = asyncpraw.Reddit(
# client_id="yjGfys3QZPpdCpNZl25Kig",
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
# password="&honeyB90",
# user_agent="Curious",
# username="Final-Difference7055",
# )
# subreddit = await reddit.subreddit("python")
# async for submission in subreddit.new(limit=2):
# print(f"Post: {submission.title}")
# # Load the submission's comments
# await submission.load()
# # Iterate over the comments
# async for comment in submission.comments:
# if isinstance(comment, asyncpraw.models.Comment):
# print(f" - {comment.body}")
# # Close the Reddit client
# await reddit.close()
# asyncio.run(fetch_comments())
# asyncio.run(getPostComments(file_name='posts_data_1732377539625804.csv'))
# import asyncio
# import heapq
# import time
# import pandas as pd
# import json
# from asyncpraw.models import Comment
# from threading import Lock
# class PriorityQueue:
# def __init__(self):
# self.heap = []
# self.lock = Lock()
# def push(self, priority, timestamp, query):
# with self.lock:
# heapq.heappush(self.heap, (priority, timestamp, query))
# def pop(self):
# with self.lock:
# return heapq.heappop(self.heap)
# def is_empty(self):
# with self.lock:
# return len(self.heap) == 0
# async def process_comment(comment, reply_limit):
# """
# Recursively process a comment and its replies up to the specified reply limit.
# """
# comment_data = {
# "user": comment.author.name if comment.author else "Unknown",
# "comment": comment.body,
# "replies": []
# }
# if hasattr(comment, "replies") and comment.replies:
# for reply in comment.replies:
# if reply_limit == 0:
# break
# reply_data = await process_comment(reply, reply_limit=reply_limit - 1)
# comment_data["replies"].append(reply_data)
# return comment_data
# async def fetch_submission(queue, reddit, data, exit_event, output_file):
# """
# Fetch Reddit submissions from the queue and process their comments.
# Save the output to the provided file.
# """
# while not exit_event.is_set():
# if not queue.is_empty():
# _, _, (url, index) = queue.pop()
# print(f"Fetching comments for: {url}")
# try:
# submission = await asyncio.wait_for(reddit.submission(url=url), timeout=30)
# await submission.comments.replace_more(limit=2)
# await submission.load()
# # Process comments
# comments_data = []
# comment_queue = list(submission.comments)
# comment_count = 0
# threshold = 40 # Set your desired threshold
# while comment_queue:
# if comment_count >= threshold:
# break
# comment = comment_queue.pop(0)
# if isinstance(comment, Comment):
# comment_data = await process_comment(comment, reply_limit=3)
# comments_data.append(comment_data)
# comment_count += 1
# print(f"Processed comments for: {url}")
# # Save comments data to the dataframe
# data.at[index, "comments"] = json.dumps(comments_data) # Save as JSON string
# except asyncio.TimeoutError:
# print(f"Timeout while processing: {url}")
# except Exception as e:
# print(f"Error processing {url}: {e}")
# # Save the data to the output file periodically
# data.to_csv(output_file, index=False)
# else:
# await asyncio.sleep(1)
# async def load_urls_to_queue(queue, file_name, data, exit_event):
# """
# Load URLs from a CSV file into the processing queue.
# """
# for index, row in data.iterrows():
# queue.push(0, time.time(), (row['url'], index)) # Push each URL and index into the queue
# print("All URLs loaded into the queue.")
# while not exit_event.is_set():
# await asyncio.sleep(1)
# async def main(input_file, output_file):
# reddit = asyncpraw.Reddit(
# client_id="yjGfys3QZPpdCpNZl25Kig",
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
# password="&honeyB90",
# user_agent="Curious",
# username="Final-Difference7055",
# )
# """
# Main function to coordinate queue loading and submission processing.
# """
# queue = PriorityQueue()
# exit_event = asyncio.Event()
# # Load the input file
# data = pd.read_csv(input_file)
# data["comments"] = "" # Add a new column for storing comments
# # Start tasks
# loader_task = asyncio.create_task(load_urls_to_queue(queue, input_file, data, exit_event))
# processor_task = asyncio.create_task(fetch_submission(queue, reddit, data, exit_event, output_file))
# try:
# await asyncio.gather(loader_task, processor_task)
# except KeyboardInterrupt:
# print("Exiting... Setting exit event.")
# exit_event.set()
# await asyncio.gather(loader_task, processor_task)
# if __name__ == "__main__":
# import asyncpraw
# # Provide your input and output file paths
# input_file = "posts_data_1732377539625804.csv" # Input file with a column 'url'
# output_file = "output_file.csv" # Output file to save results
# asyncio.run(main(input_file, output_file))