import asyncio
import asyncpraw
import pandas as pd
import json
import ast
from reddit.reddit_sentiment_analysis import SentimentAnalysis
from reddit.scraping import getPostComments
# Load the CSV file
# df = pd.read_csv('posts_data_1732297356855788.csv')
# # Convert the 'comments' column to a list
# comments_list = []
# for i in df['comments']:
#     # json_data = ast.literal_eval(i)
#     comments_list.append(i)
# print(comments_list[1][0:100])
# print(df['comments'])
| # # # print("comments_list",len()) | |
| # # # Recursive function to count non-empty replies | |
| # # def count_non_empty_replies(comments): | |
| # # count = 1 | |
| # # for comment in comments: | |
| # # if comment.get("replies"): | |
| # # count += 1 # Increment for this non-empty replies list | |
| # # count += count_non_empty_replies(comment["replies"]) # Recursively count nested replies | |
| # # return count | |
| # # # Example usage | |
| # # # total_non_empty_replies = count_non_empty_replies(comments_list[0]['comments'][0]['replies']) | |
| # # # print("Total non-empty replies:", total_non_empty_replies) | |
# # # Sample data structure
# # # comments_list = [
# # #     {
# # #         "comments": [
# # #             # Each post contains a list of comments with nested replies
# # #         ]
# # #     }
# # #     # More comments_list here
# # # ]
# # # Recursive function to limit replies in a comment tree
# # def limit_replies(comment, reply_limit=3):
# #     limited_comment = {
# #         "user": comment["user"],
# #         "comment": comment["comment"],
# #         "replies": []
# #     }
# #     if reply_limit <= 0:
# #         return limited_comment
# #     # Get up to reply_limit replies, recursively applying the function
# #     if "replies" in comment:
# #         for reply in comment["replies"][:reply_limit]:
# #             limited_comment["replies"].append(limit_replies(reply, reply_limit - 1))
# #     return limited_comment
# # # Function to process each post, extracting 10 comments with limited replies
# # def get_limited_comments(comments_list, comment_limit=10, reply_limit=7):
# #     limited_comments_list = []
# #     for post in comments_list:
# #         limited_post = {"comments": []}
# #         # Get up to comment_limit comments for each post
# #         for comment in post["comments"][:comment_limit]:
# #             limited_comment = limit_replies(comment, reply_limit)
# #             limited_post["comments"].append(limited_comment)
# #         limited_comments_list.append(limited_post)
# #     return limited_comments_list
# # Example usage
# # limited_comments_data = get_limited_comments(comments_list)
# # total_non_empty_replies = count_non_empty_replies(limited_comments_data[0]['comments'][0]['replies'])
# # Save the list to a JSON file
# # with open('comments2.json', 'w') as json_file:
# #     json.dump(comments_list, json_file, indent=4)
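# A minimal, self-contained sketch of the same trimming idea as limit_replies /
# get_limited_comments above. It assumes the nested dict shape used in the commented
# code ({"user", "comment", "replies"}); the names trim_comment_tree / trim_post are
# illustrative, not part of the original module.
def trim_comment_tree(comment, reply_depth=3):
    # Copy the comment's fields and recurse into replies until the depth budget is spent.
    trimmed = {
        "user": comment.get("user"),
        "comment": comment.get("comment"),
        "replies": [],
    }
    if reply_depth > 0:
        for reply in comment.get("replies", []):
            trimmed["replies"].append(trim_comment_tree(reply, reply_depth - 1))
    return trimmed

def trim_post(post, comment_limit=10, reply_depth=3):
    # Keep only the first comment_limit top-level comments, each trimmed to reply_depth levels.
    return {"comments": [trim_comment_tree(c, reply_depth) for c in post.get("comments", [])[:comment_limit]]}

# Example usage (kept commented, like the experiments above):
# trimmed = trim_post({"comments": [{"user": "a", "comment": "hi", "replies": [{"user": "b", "comment": "reply", "replies": []}]}]})
# print(json.dumps(trimmed, indent=2))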
# # # from reddit.scraping import getPostComments
# # # getPostComments(fileName="posts_data_1732105228633815.csv")
# # import time
# # from reddit.reddit_sentiment_analysis import SentimentAnalysis
# # # Create an instance of the SentimentAnalysis class
# # sentiment_instance = SentimentAnalysis()
# # # Record the start time
# # start_time = time.time()
# # # Call the method to generate sentiment and emotion
# # sentiment_instance.generate_sentiment_and_emotion_from_data(fileName='posts_data.csv')
# # # Record the end time
# # end_time = time.time()
# # # Calculate and print the processing time
# # process_time = end_time - start_time
# # print(f"Processing time: {process_time:.2f} seconds")
# # from reddit.reddit_pain_point_analysis import pain_point_analysis
# # pain_point_analysis(user_query="artificial intelligence applications in skincare and cosmetic industry", fileName="file_with_sentiment.csv")
# # import google.generativeai as genai
# # genai.configure(api_key='AIzaSyBtHE4Bg2ERWsKeGLxGPOSmtZeWRD6nNr0')
# # model = genai.GenerativeModel("gemini-1.5-flash")
# # generation_config = genai.GenerationConfig(response_mime_type="application/json")
# # response = model.generate_content("skin care ai ", generation_config=generation_config)  # Adjust if the library supports async
# # data = response.text
# # print(data)
# '''
# Only scraping-related code.
# '''
# from selenium import webdriver
# from selenium.webdriver.common.action_chains import ActionChains
# from selenium.webdriver.common.by import By
# from selenium.webdriver.firefox.options import Options as FirefoxOptions
# import time
# from fake_headers import Headers
# import pandas as pd
# import praw
# # from reddit_call import sentence_model
# import re
# # # Set up the WebDriver
# header = Headers().generate()["User-Agent"]
# proxy = None
# browser_option = FirefoxOptions()
# browser_option.add_argument("--no-sandbox")
# browser_option.add_argument("--disable-dev-shm-usage")
# browser_option.add_argument("--ignore-certificate-errors")
# browser_option.add_argument("--disable-gpu")
# browser_option.add_argument("--log-level=3")
# browser_option.add_argument("--disable-notifications")
# browser_option.add_argument("--disable-popup-blocking")
# browser_option.add_argument("--user-agent={}".format(header))
# if proxy is not None:
#     browser_option.add_argument("--proxy-server=%s" % proxy)
# # For hiding the browser window
# browser_option.add_argument("--headless")
# driver = webdriver.Firefox(options=browser_option)
# actions = ActionChains(driver)
# reddit = praw.Reddit(
#     client_id="yjGfys3QZPpdCpNZl25Kig",
#     client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
#     password="&honeyB90",
#     user_agent="Curious",
#     username="Final-Difference7055",
# )
# fileName = "posts_data_1732244765294548.csv"
# data = pd.DataFrame(pd.read_csv(fileName))
# data["comments"] = ""
# for i in range(len(data)):
#     # comment_data_sub = []
#     submission = reddit.submission(url=data.iloc[i]['url'])
#     # Fetch and process comments
#     submission.comments.replace_more(limit=2)  # Use limit=None to replace all MoreComments objects
#     comments_data = []
#     # Function to process a comment and its replies
#     # Seed with top-level comments
#     comment_queue = list(submission.comments)
#     comment_count = 0
#     threshold = 20
#     while comment_queue:
#         if comment_count >= threshold:
#             break
#         comment = comment_queue.pop(0)
#         # NOTE: process_comment and isForCompetitorAnalysis are not defined in this snippet;
#         # a hypothetical sketch of process_comment follows after this block.
#         comment_data = process_comment(comment, reply_limit=2 if isForCompetitorAnalysis else 3)  # Process each comment
#         comments_data.append(comment_data)
#         comment_count += 1
#     # Now, structure the data into the desired JSON format
#     json_output = {
#         "comments": comments_data
#     }
#     subset_data = data.iloc[i].copy()
#     # Modify the subset
#     subset_data['comments'] = json_output
#     # Assign back if needed
#     data.iloc[i] = subset_data
# # Remove rows where 'comments' is an empty string
# data = data[data['comments'] != ""]
# data["descriptions"] = data["url"].apply(getSearchPostDescription)  # getSearchPostDescription is also not defined in this snippet
# data.to_csv(fileName, index=False)
# print("Data saved to", fileName)
# from reddit.reddit_competitor_analysis import getCompetitorNamesFromReddit
# from reddit.api_keys import api_key, api_key2, api_key3
# import google.generativeai as genai
# genai.configure(api_key=api_key3)
# getCompetitorNamesFromReddit(user_query='AI powered personalized skin care recommendations', isSolo=True, fileName='posts_data_1732244547776200.csv', last_chat_session=None)
# senti = SentimentAnalysis()
# senti.generate_sentiment_and_emotion_from_data(fileName='posts_data_1732297356855788.csv')
# import asyncio
# async def fetchComments():
#     await getPostComments(fileName='posts_data_1732377539625804.csv')
#     # await asyncio.wait_for(asyncio.to_thread(getPostComments, fileName='posts_data_1732377539625804.csv'))
# # Run the fetchComments function
# if __name__ == "__main__":
#     asyncio.run(fetchComments())
# import asyncio
# async def long_running_task():
#     await asyncio.sleep(5)  # Simulate a long-running task
#     print("hello world")
# async def main():
#     try:
#         await asyncio.wait_for(long_running_task(), timeout=500)  # Time out after 500 seconds
#     except asyncio.TimeoutError:
#         print("The task timed out!")
# asyncio.run(main())
# async def fetch_comments():
#     reddit = asyncpraw.Reddit(
#         client_id="yjGfys3QZPpdCpNZl25Kig",
#         client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
#         password="&honeyB90",
#         user_agent="Curious",
#         username="Final-Difference7055",
#     )
#     subreddit = await reddit.subreddit("python")
#     async for submission in subreddit.new(limit=2):
#         print(f"Post: {submission.title}")
#         # Load the submission's comments
#         await submission.load()
#         # Iterate over the comments
#         async for comment in submission.comments:
#             if isinstance(comment, asyncpraw.models.Comment):
#                 print(f" - {comment.body}")
#     # Close the Reddit client
#     await reddit.close()
# asyncio.run(fetch_comments())
# asyncio.run(getPostComments(fileName='posts_data_1732377539625804.csv'))
# import asyncio
# import heapq
# import time
# import pandas as pd
# import json
# from asyncpraw.models import Comment
# from threading import Lock
# class PriorityQueue:
#     def __init__(self):
#         self.heap = []
#         self.lock = Lock()
#     def push(self, priority, timestamp, query):
#         with self.lock:
#             heapq.heappush(self.heap, (priority, timestamp, query))
#     def pop(self):
#         with self.lock:
#             return heapq.heappop(self.heap)
#     def is_empty(self):
#         with self.lock:
#             return len(self.heap) == 0
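# The PriorityQueue above relies on heapq's tuple ordering: (priority, timestamp, query)
# tuples compare element by element, so a lower priority number pops first and equal
# priorities fall back to insertion time. A quick self-contained illustration:
import heapq
import time

def _priority_queue_demo():
    heap = []
    heapq.heappush(heap, (1, time.time(), "low-priority url"))
    heapq.heappush(heap, (0, time.time(), "high-priority url"))
    # Pops the (0, ...) entry first because 0 < 1.
    return heapq.heappop(heap)

# Example usage (kept commented, like the other experiments):
# print(_priority_queue_demo())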
# async def process_comment(comment, reply_limit):
#     """
#     Recursively process a comment and its replies, limiting nesting to reply_limit levels deep.
#     """
#     comment_data = {
#         "user": comment.author.name if comment.author else "Unknown",
#         "comment": comment.body,
#         "replies": []
#     }
#     if hasattr(comment, "replies") and comment.replies:
#         for reply in comment.replies:
#             if reply_limit == 0:
#                 break
#             reply_data = await process_comment(reply, reply_limit=reply_limit - 1)
#             comment_data["replies"].append(reply_data)
#     return comment_data
# async def fetch_submission(queue, reddit, data, exit_event, output_file):
#     """
#     Fetch Reddit submissions from the queue and process their comments.
#     Save the output to the provided file.
#     """
#     while not exit_event.is_set():
#         if not queue.is_empty():
#             _, _, (url, index) = queue.pop()
#             print(f"Fetching comments for: {url}")
#             try:
#                 submission = await asyncio.wait_for(reddit.submission(url=url), timeout=30)
#                 await submission.comments.replace_more(limit=2)
#                 await submission.load()
#                 # Process comments
#                 comments_data = []
#                 comment_queue = list(submission.comments)
#                 comment_count = 0
#                 threshold = 40  # Set your desired threshold
#                 while comment_queue:
#                     if comment_count >= threshold:
#                         break
#                     comment = comment_queue.pop(0)
#                     if isinstance(comment, Comment):
#                         comment_data = await process_comment(comment, reply_limit=3)
#                         comments_data.append(comment_data)
#                         comment_count += 1
#                 print(f"Processed comments for: {url}")
#                 # Save comments data to the dataframe
#                 data.at[index, "comments"] = json.dumps(comments_data)  # Save as a JSON string
#             except asyncio.TimeoutError:
#                 print(f"Timeout while processing: {url}")
#             except Exception as e:
#                 print(f"Error processing {url}: {e}")
#             # Save the data to the output file periodically
#             data.to_csv(output_file, index=False)
#         else:
#             await asyncio.sleep(1)
# async def load_urls_to_queue(queue, file_name, data, exit_event):
#     """
#     Load the URLs from the already-read dataframe into the processing queue
#     (file_name is accepted for symmetry but not used here).
#     """
#     for index, row in data.iterrows():
#         queue.push(0, time.time(), (row['url'], index))  # Push each URL and its row index into the queue
#     print("All URLs loaded into the queue.")
#     while not exit_event.is_set():
#         await asyncio.sleep(1)
# async def main(input_file, output_file):
#     """
#     Main function to coordinate queue loading and submission processing.
#     """
#     reddit = asyncpraw.Reddit(
#         client_id="yjGfys3QZPpdCpNZl25Kig",
#         client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
#         password="&honeyB90",
#         user_agent="Curious",
#         username="Final-Difference7055",
#     )
#     queue = PriorityQueue()
#     exit_event = asyncio.Event()
#     # Load the input file
#     data = pd.read_csv(input_file)
#     data["comments"] = ""  # Add a new column for storing comments
#     # Start tasks
#     loader_task = asyncio.create_task(load_urls_to_queue(queue, input_file, data, exit_event))
#     processor_task = asyncio.create_task(fetch_submission(queue, reddit, data, exit_event, output_file))
#     try:
#         await asyncio.gather(loader_task, processor_task)
#     except KeyboardInterrupt:
#         print("Exiting... Setting exit event.")
#         exit_event.set()
#         await asyncio.gather(loader_task, processor_task)
| # if __name__ == "__main__": | |
| # import asyncpraw | |
| # # Provide your input and output file paths | |
| # input_file = "posts_data_1732377539625804.csv" # Input file with a column 'url' | |
| # output_file = "output_file.csv" # Output file to save results | |
| # asyncio.run(main(input_file, output_file)) | |
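# Note on shutdown: a KeyboardInterrupt raised while asyncio.run(...) is blocking usually
# cancels the running task rather than landing in the try/except inside main() above, so
# exit_event may never get set. A commonly used alternative on Unix is to register a signal
# handler on the running loop. A hedged, self-contained sketch of that pattern (the name
# run_until_interrupted and the wiring are illustrative, not part of the original code):
# import asyncio, signal, contextlib
# async def run_until_interrupted(worker_coro):
#     stop = asyncio.Event()
#     loop = asyncio.get_running_loop()
#     loop.add_signal_handler(signal.SIGINT, stop.set)  # Ctrl-C sets the flag instead of raising
#     task = asyncio.create_task(worker_coro)
#     await stop.wait()
#     task.cancel()
#     with contextlib.suppress(asyncio.CancelledError):
#         await task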