# NOTE(review): the seven lines below are Hugging Face Hub page furniture
# (file path, author, commit message "bf4f857", raw/history links, file size)
# that was pasted into the source. As bare text they break the module — the
# stray apostrophe in line 2 is a SyntaxError — so they are kept as comments.
# nextAnalytics / test.py
# honeybansal23's picture
# fixed code bugs largely
# bf4f857
# raw
# history blame
# 14 kB
import asyncio
import asyncpraw
import pandas as pd
import json
import ast
from reddit.reddit_sentiment_analysis import SentimentAnalysis
from reddit.scraping import getPostComments
# Load the CSV file
# df = pd.read_csv('posts_data_1732297356855788.csv')
# # Convert the 'comments' column to a list
# comments_list = []
# for i in df['comments']:
# # json_data= ast.literal_eval(i)
# comments_list.append(i)
# print(comments_list[1][0:100])
# print(df['comments'])
# # # print("comments_list",len())
# # # Recursive function to count non-empty replies
# # def count_non_empty_replies(comments):
# # count = 1
# # for comment in comments:
# # if comment.get("replies"):
# # count += 1 # Increment for this non-empty replies list
# # count += count_non_empty_replies(comment["replies"]) # Recursively count nested replies
# # return count
# # # Example usage
# # # total_non_empty_replies = count_non_empty_replies(comments_list[0]['comments'][0]['replies'])
# # # print("Total non-empty replies:", total_non_empty_replies)
# # # Sample data structure
# # # comments_list = [
# # # {
# # # "comments": [
# # # # Each post contains a list of comments with nested replies
# # # ]
# # # }
# # # # More comments_list here
# # # ]
# # # Recursive function to limit replies in a comment tree
# # def limit_replies(comment, reply_limit=3):
# # limited_comment = {
# # "user": comment["user"],
# # "comment": comment["comment"],
# # "replies": []
# # }
# # if reply_limit == 0:
# # return limited_comment
# # # Get up to reply_limit replies, recursively applying the function
# # if "replies" in comment:
# # for reply in comment["replies"][:reply_limit-1]:
# # limited_comment["replies"].append(limit_replies(reply, reply_limit-2))
# # return limited_comment
# # # Function to process each post, extracting 10 comments with limited replies
# # def get_limited_comments(comments_list, comment_limit=10, reply_limit=7):
# # limited_comments_list = []
# # for post in comments_list:
# # limited_post = {"comments": []}
# # # Get up to comment_limit comments for each post
# # for comment in post["comments"][:comment_limit]:
# # limited_comment = limit_replies(comment, reply_limit)
# # limited_post["comments"].append(limited_comment)
# # limited_comments_list.append(limited_post)
# # return limited_comments_list
# # Example usage
# # limited_comments_data = get_limited_comments(comments_list)
# # total_non_empty_replies = count_non_empty_replies(limited_comments_data[0]['comments'][0]['replies'])
# # Save the list to a JSON file
# # with open('comments2.json', 'w') as json_file:
# # json.dump(comments_list, json_file,indent=4)
# # # from reddit.scraping import getPostComments
# # # getPostComments(fileName="posts_data_1732105228633815.csv")
# # import time
# # from reddit.reddit_sentiment_analysis import SentimentAnalysis
# # # Create an instance of the SentimentAnalysis class
# # sentiment_instance = SentimentAnalysis()
# # # Record the start time
# # start_time = time.time()
# # # Call the method to generate sentiment and emotion
# # sentiment_instance.generate_sentiment_and_emotion_from_data(fileName='posts_data.csv')
# # # Record the end time
# # end_time = time.time()
# # # Calculate and print the processing time
# # process_time = end_time - start_time
# # print(f"Processing time: {process_time:.2f} seconds")
# # from reddit.reddit_pain_point_analysis import pain_point_analysis
# # pain_point_analysis(user_query="artificial intelligence applications in skincare and cosmetic industry",fileName="file_with_sentiment.csv")
# # import google.generativeai as genai
# # SECURITY(review): a real-looking Google Generative AI key is hard-coded
# # below. Even commented out it is leaked through version control history —
# # revoke/rotate it and load keys from an environment variable instead.
# # genai.configure(api_key='AIzaSyBtHE4Bg2ERWsKeGLxGPOSmtZeWRD6nNr0')
# # model = genai.GenerativeModel("gemini-1.5-flash")
# # generation_config = genai.GenerationConfig(response_mime_type="application/json")
# # response = model.generate_content("skin care ai ", generation_config=generation_config) # Adjust if the library supports async
# # data = response.text
# # print(data)
# '''
# Only Scraping related code.
# '''
# from selenium import webdriver
# from selenium.webdriver.common.action_chains import ActionChains
# from selenium.webdriver.common.by import By
# from selenium.webdriver.firefox.options import Options as FirefoxOptions
# import time
# from fake_headers import Headers
# import pandas as pd
# import praw
# # from reddit_call import sentence_model
# import re
# # # Set up WebDriver
# header = Headers().generate()["User-Agent"]
# proxy=None
# browser_option = FirefoxOptions()
# browser_option.add_argument("--no-sandbox")
# browser_option.add_argument("--disable-dev-shm-usage")
# browser_option.add_argument("--ignore-certificate-errors")
# browser_option.add_argument("--disable-gpu")
# browser_option.add_argument("--log-level=3")
# browser_option.add_argument("--disable-notifications")
# browser_option.add_argument("--disable-popup-blocking")
# browser_option.add_argument("--user-agent={}".format(header))
# if proxy is not None:
# browser_option.add_argument("--proxy-server=%s" % proxy)
# # For Hiding Browser
# browser_option.add_argument("--headless")
# driver = webdriver.Firefox(options=browser_option)
# actions = ActionChains(driver)
# SECURITY(review): real Reddit client_id, client_secret, account username and
# password are committed here (and repeated verbatim twice further down in the
# asyncpraw sections). Commented-out or not, they are exposed in repo history —
# rotate these credentials and read them from environment variables / a secrets
# manager (e.g. praw.ini or os.environ) instead of literals.
# reddit = praw.Reddit(
# client_id="yjGfys3QZPpdCpNZl25Kig",
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
# password="&honeyB90",
# user_agent="Curious",
# username="Final-Difference7055",
# )
# fileName="posts_data_1732244765294548.csv"
# data= pd.DataFrame(pd.read_csv(fileName))
# data["comments"]=""
# for i in range(len(data)):
# # comment_data_sub=[]
# submission = reddit.submission(url=data.iloc[i]['url'])
# # Fetch and process comments
# submission.comments.replace_more(limit=2) # Use limit=0 to get all comments
# comments_data = []
# # Function to process a comment and its replies
# # Seed with top-level comments
# comment_queue = list(submission.comments)
# comment_count=0
# threshold=20
# while comment_queue:
# if(comment_count>=threshold):
# break
# comment = comment_queue.pop(0)
# comment_data = process_comment(comment,reply_limit=2 if isForCompetitorAnalysis else 3) # Process each comment
# comments_data.append(comment_data)
# comment_count+=1
# # Now, structure the data into the desired JSON format
# json_output = {
# "comments": comments_data
# }
# subset_data = data.iloc[i].copy()
# # Modify the subset
# subset_data['comments'] = json_output
# # Assign back if needed
# data.iloc[i] = subset_data
# # Remove rows where 'comments' is an empty string
# data = data[data['comments'] != ""]
# data["descriptions"] = data["url"].apply(getSearchPostDescription)
# data.to_csv(fileName, index=False)
# print("Data saved to",fileName)
# from reddit.reddit_competitor_analysis import getCompetitorNamesFromReddit
# from reddit.api_keys import api_key,api_key2, api_key3
# import google.generativeai as genai
# genai.configure(api_key=api_key3)
# getCompetitorNamesFromReddit(user_query='AI powered personalized skin care recommendations',isSolo=True,fileName='posts_data_1732244547776200.csv',last_chat_session=None)
# senti = SentimentAnalysis()
# senti.generate_sentiment_and_emotion_from_data(fileName='posts_data_1732297356855788.csv')
# import asyncio
# async def fetchComments():
# # async def fetchComments():
# await getPostComments(fileName='posts_data_1732377539625804.csv')
# # await asyncio.wait_for(asyncio.to_thread(getPostComments, fileName='posts_data_1732377539625804.csv'))
# # Run the fetchComments function
# if __name__ == "__main__":
# asyncio.run(fetchComments())
# import asyncio
# async def long_running_task():
# await asyncio.sleep(5) # Simulate a long-running task
# print("hello world")
# async def main():
# try:
# await asyncio.wait_for(long_running_task(), timeout=500) # Set timeout to 2 seconds
# except asyncio.TimeoutError:
# print("The task timed out!")
# asyncio.run(main())
# async def fetch_comments():
# reddit = asyncpraw.Reddit(
# client_id="yjGfys3QZPpdCpNZl25Kig",
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
# password="&honeyB90",
# user_agent="Curious",
# username="Final-Difference7055",
# )
# subreddit = await reddit.subreddit("python")
# async for submission in subreddit.new(limit=2):
# print(f"Post: {submission.title}")
# # Load the submission's comments
# await submission.load()
# # Iterate over the comments
# async for comment in submission.comments:
# if isinstance(comment, asyncpraw.models.Comment):
# print(f" - {comment.body}")
# # Close the Reddit client
# await reddit.close()
# asyncio.run(fetch_comments())
# asyncio.run(getPostComments(file_name='posts_data_1732377539625804.csv'))
# import asyncio
# import heapq
# import time
# import pandas as pd
# import json
# from asyncpraw.models import Comment
# from threading import Lock
# class PriorityQueue:
# def __init__(self):
# self.heap = []
# self.lock = Lock()
# def push(self, priority, timestamp, query):
# with self.lock:
# heapq.heappush(self.heap, (priority, timestamp, query))
# def pop(self):
# with self.lock:
# return heapq.heappop(self.heap)
# def is_empty(self):
# with self.lock:
# return len(self.heap) == 0
# async def process_comment(comment, reply_limit):
# """
# Recursively process a comment and its replies up to the specified reply limit.
# """
# comment_data = {
# "user": comment.author.name if comment.author else "Unknown",
# "comment": comment.body,
# "replies": []
# }
# if hasattr(comment, "replies") and comment.replies:
# for reply in comment.replies:
# if reply_limit == 0:
# break
# reply_data = await process_comment(reply, reply_limit=reply_limit - 1)
# comment_data["replies"].append(reply_data)
# return comment_data
# async def fetch_submission(queue, reddit, data, exit_event, output_file):
# """
# Fetch Reddit submissions from the queue and process their comments.
# Save the output to the provided file.
# """
# while not exit_event.is_set():
# if not queue.is_empty():
# _, _, (url, index) = queue.pop()
# print(f"Fetching comments for: {url}")
# try:
# submission = await asyncio.wait_for(reddit.submission(url=url), timeout=30)
# await submission.comments.replace_more(limit=2)
# await submission.load()
# # Process comments
# comments_data = []
# comment_queue = list(submission.comments)
# comment_count = 0
# threshold = 40 # Set your desired threshold
# while comment_queue:
# if comment_count >= threshold:
# break
# comment = comment_queue.pop(0)
# if isinstance(comment, Comment):
# comment_data = await process_comment(comment, reply_limit=3)
# comments_data.append(comment_data)
# comment_count += 1
# print(f"Processed comments for: {url}")
# # Save comments data to the dataframe
# data.at[index, "comments"] = json.dumps(comments_data) # Save as JSON string
# except asyncio.TimeoutError:
# print(f"Timeout while processing: {url}")
# except Exception as e:
# print(f"Error processing {url}: {e}")
# # Save the data to the output file periodically
# data.to_csv(output_file, index=False)
# else:
# await asyncio.sleep(1)
# async def load_urls_to_queue(queue, file_name, data, exit_event):
# """
# Load URLs from a CSV file into the processing queue.
# """
# for index, row in data.iterrows():
# queue.push(0, time.time(), (row['url'], index)) # Push each URL and index into the queue
# print("All URLs loaded into the queue.")
# while not exit_event.is_set():
# await asyncio.sleep(1)
# async def main(input_file, output_file):
# reddit = asyncpraw.Reddit(
# client_id="yjGfys3QZPpdCpNZl25Kig",
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
# password="&honeyB90",
# user_agent="Curious",
# username="Final-Difference7055",
# )
# """
# Main function to coordinate queue loading and submission processing.
# """
# queue = PriorityQueue()
# exit_event = asyncio.Event()
# # Load the input file
# data = pd.read_csv(input_file)
# data["comments"] = "" # Add a new column for storing comments
# # Start tasks
# loader_task = asyncio.create_task(load_urls_to_queue(queue, input_file, data, exit_event))
# processor_task = asyncio.create_task(fetch_submission(queue, reddit, data, exit_event, output_file))
# try:
# await asyncio.gather(loader_task, processor_task)
# except KeyboardInterrupt:
# print("Exiting... Setting exit event.")
# exit_event.set()
# await asyncio.gather(loader_task, processor_task)
# if __name__ == "__main__":
# import asyncpraw
# # Provide your input and output file paths
# input_file = "posts_data_1732377539625804.csv" # Input file with a column 'url'
# output_file = "output_file.csv" # Output file to save results
# asyncio.run(main(input_file, output_file))