honey234 committed
Commit 213206a · 1 Parent(s): e56eed8

fixed bugs and added retry logic

apis/reddit_apis.py CHANGED
@@ -9,7 +9,7 @@ from models.pain_point_model import PainPointAnalysisModel
 from models.reddit_models import RedditPostDataModel
 from models.session_model import InputInfoModel
 from reddit.reddit_competitor_analysis import getCompetitorAnalysisData
-from reddit.reddit_functions import getRedditData
+from reddit.reddit_functions import getRedditData_with_timeout
 from reddit.reddit_gemini import getKeywords
 from reddit.reddit_pain_point_analysis import pain_point_analysis
 from reddit.reddit_utils import reddit_services_names
@@ -84,7 +84,8 @@ async def getRedditPostsData(request: RedditPostDataModel):
         if not search_keywords:
             raise HTTPException(status_code=400, detail="Search keywords must not be empty")
         print("user_query",user_query,"search_keywords",search_keywords)
-        result = await getRedditData(user_query=user_query, search_keywords=search_keywords)
+        result = await getRedditData_with_timeout(user_query=user_query, search_keywords=search_keywords)
+        print('getRedditPostsData: ', result)
         return result
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(f"Failed to run getRedditPostsData : {e}"))
@@ -163,7 +164,7 @@ async def analyzeData(inputData:InputInfoModel,user_session:dict):
     try:
         keywords = getKeywords(user_query=inputData.query)
 
-        reddit_data_result = await getRedditData(user_query=keywords['query'], search_keywords=keywords['top_3_combinations'])
+        reddit_data_result = await getRedditData_with_timeout(user_query=keywords['query'], search_keywords=keywords['top_3_combinations'])
         update_user_session(user_session=user_session,session_info=session_info_result,process_info=process_info)
 
         services_result,session_info_result = await getServices(
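
Since the route now awaits the retrying wrapper instead of calling getRedditData directly, it helps to see the full handler shape in one place. Below is a minimal sketch assuming FastAPI; the route path, request model fields, and the stub pipeline are placeholders, not repository code. One subtlety worth showing: a bare `except Exception` like the one in the diff also catches the deliberate 400 HTTPException and re-wraps it as a 500, so the sketch passes it through first.

# Minimal sketch of the endpoint pattern above (illustrative names, not repo code).
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

class RedditPostDataRequest(BaseModel):  # hypothetical stand-in for RedditPostDataModel
    user_query: str
    search_keywords: list[str]

async def getRedditData_with_timeout(user_query, search_keywords):  # stub for the real import
    return {"fileName": None, "successful_steps": []}

@app.post("/getRedditPostsData")
async def getRedditPostsData(request: RedditPostDataRequest):
    try:
        if not request.search_keywords:
            raise HTTPException(status_code=400, detail="Search keywords must not be empty")
        result = await getRedditData_with_timeout(
            user_query=request.user_query, search_keywords=request.search_keywords
        )
        print('getRedditPostsData: ', result)
        return result
    except HTTPException:
        raise  # without this, the 400 above would be re-wrapped as a 500
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to run getRedditPostsData : {e}")
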
reddit/prompts.py CHANGED
@@ -112,7 +112,7 @@ def featureAnalysisPrompt():
 
 def getPainPointAnalysisPrompt(user_query):
     return f"""
-    Analyze the file_with_sentiment.csv data of Reddit posts for the user query = "{user_query}" to perform **pain point analysis**. Use the categories derived internally for the analysis, but do not return them. Focus only on the detailed pain point analysis results.
+    Analyze the given csv data of Reddit posts for the user query = "{user_query}" to perform **pain point analysis**. Use the categories derived internally for the analysis, but do not return them. Focus only on the detailed pain point analysis results.
 
     Return the response in the **JSON format** provided below, and include data for all categories identified during your internal process. Ensure your response adheres strictly to this structure and **do not include any intermediate data or steps**.
 
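
The rewording makes the prompt filename-agnostic: the analysis no longer assumes an upload named file_with_sentiment.csv, so the same template works for whichever CSV is attached to the model session. A trivial sketch of the property this buys (template body trimmed, query string made up):

# Trimmed copy of the template above; only the first line matters for this check.
def getPainPointAnalysisPrompt(user_query):
    return f"""
    Analyze the given csv data of Reddit posts for the user query = "{user_query}" to perform **pain point analysis**.
    """

prompt = getPainPointAnalysisPrompt("budgeting apps")
assert "file_with_sentiment.csv" not in prompt  # no longer bound to one upload name
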
reddit/reddit_competitor_analysis.py CHANGED
@@ -135,6 +135,7 @@ async def getPostDataofCompetitor(fileName, user_query):
         unique_list = get_microseconds_list(length=len(df))
         actual_list = []
         count=0
+        competitor_names = []
         # Use ThreadPoolExecutor to run tasks concurrently
         with concurrent.futures.ThreadPoolExecutor(max_workers=len(scraper_ant_keys)) as executor:
             futures = []
@@ -152,7 +153,8 @@ async def getPostDataofCompetitor(fileName, user_query):
                         result = future.result()
                         if result is not None:
                             actual_list.append(result)
-                            count+=1
+                            competitor_names.append(df.iloc[count]['name'])
+                            count+=1
                     futures = []
 
             if futures:
@@ -160,7 +162,8 @@ async def getPostDataofCompetitor(fileName, user_query):
                     result = future.result()
                     if result is not None:
                         actual_list.append(result)
-                        count+=1
+                        competitor_names.append(df.iloc[count]['name'])
+                        count+=1
 
         print("Fetched data for competitors")
         fileNames = [f"posts_data_{actual_list[i]}.csv" for i in range(len(actual_list))]
@@ -186,7 +189,7 @@ async def getPostDataofCompetitor(fileName, user_query):
         )
 
         # # Proceed with preprocessing
-        result = preprocessingCompetitorsData(user_query=user_query, fileNames=fileNames)
+        result = preprocessingCompetitorsData(user_query=user_query, fileNames=fileNames, competitor_names=competitor_names)
         return result
     except Exception as e:
         traceback.print_exc()
@@ -195,20 +198,21 @@ async def getPostDataofCompetitor(fileName, user_query):
         return {'details': 'No data found'}
 
 
-def preprocessingCompetitorsData(user_query,fileNames):
+def preprocessingCompetitorsData(user_query,fileNames,competitor_names):
     c=0
     competitors_json_data = []
     try:
         for i in range(len(fileNames)):
             if c==6:break
             print(f"Processing file {fileNames[i]}")
+            print('competitor Name ', competitor_names[i])
             json_data = getCompetitorAnalysisReport(user_query=user_query,fileName=fileNames[i],count=c)
             c+=1
             # if json_data does not contain "details" field, then only save the json
             if "details" not in json_data.keys():
                 print("Competitor Analysis Report",f"competitor_analysis_report_{fileNames[i]}.json")
                 competitors_json_data.append(json_data)
-
+                print('competitor Analysis success for ', competitor_names[i])
 
         for file_path in fileNames:
             # Check if the file exists before attempting to delete
@@ -222,56 +226,52 @@ def preprocessingCompetitorsData(user_query,fileNames):
         traceback.print_exc()
     return competitors_json_data
 def getCompetitorAnalysisReport(user_query,fileName,count=0):
-    try:
-        prompt = getCompetitorPrompt(user_query=user_query)
-        api_key_map = {
-            0: api_key5,
-            1: api_key6,
-            2: api_key7,
-            3: api_key8,
-            4: api_key9,
-            5: api_key10
-        }
-
-        selected_api_key = api_key_map.get(count, api_key8) # Default to api_key8 if count > 5
-        genai.configure(api_key=selected_api_key)
-        data = getModelAndGenerationConfigCommon(fileName=fileName,modelName='gemini-2.0-flash-exp')
-        model = data[0]
-        chat_session = model.start_chat(
-            history=[
-                {
-                    "role": "user",
-                    "parts": [
-                        data[1],
-                        prompt
-                    ],
-                }
-            ]
-        )
-
-        try:
-            response = chat_session.send_message("give your last response of competitor analysis")
-            data = response.text
-            json_data = json.loads(data)
-            print("competitor analysis done for ",user_query)
-            return json_data
-        except:
-            try:
-                # retry
-                response = chat_session.send_message("give your last response of competitor analysis")
-                data = response.text
-                json_data = json.loads(data)
-                print("retry competitor analysis done for ",user_query)
-                return json_data
-            except Exception as e:
-                print(f"competitor analysis error {api_key_map[count]}",str(e))
-                traceback.print_exc()
-                return {"details": str(e)}
-    except Exception as e:
-        print(f"competitor analysis error {api_key_map[count]}",str(e))
-        traceback.print_exc()
-        return {"details": str(e)}
+    prompt = getCompetitorPrompt(user_query=user_query)
+    api_key_map = {
+        0: api_key5,
+        1: api_key6,
+        2: api_key7,
+        3: api_key8,
+        4: api_key9,
+        5: api_key10
+    }
+
+    selected_api_key = api_key_map.get(count, api_key8) # Default to api_key8 if count > 5
+    genai.configure(api_key=selected_api_key)
+    data = getModelAndGenerationConfigCommon(fileName=fileName,modelName='gemini-2.0-flash-exp')
+    model = data[0]
+    chat_session = model.start_chat(
+        history=[
+            {
+                "role": "user",
+                "parts": [
+                    data[1],
+                    prompt
+                ],
+            }
+        ]
+    )
+
+    try:
+        response = chat_session.send_message("give your last response of competitor analysis")
+        data = response.text
+        json_data = json.loads(data)
+        print("competitor analysis done for ",user_query)
+        return json_data
+    except:
+        try:
+            # retry
+            response = chat_session.send_message("give your last response of competitor analysis")
+            data = response.text
+            json_data = json.loads(data)
+            print("retry competitor analysis done for ",user_query)
+            return json_data
+        except Exception as e:
+            print(f"competitor analysis error {api_key_map[count]}",str(e))
+            traceback.print_exc()
+            return {"details": str(e)}
+
 async def getCompetitorAnalysisData(user_query,fileName):
     start_time = time.time()
 
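
Two patterns carry this file: per-report API-key rotation via api_key_map.get(count, default), and a single in-place retry of the model call that falls back to a {"details": ...} payload so the caller (preprocessingCompetitorsData) can skip bad reports. A self-contained sketch of both, with made-up key names and a fake transport standing in for chat_session.send_message:

import json

# Made-up key names; in the repo these are api_key5 ... api_key10.
api_key_map = {0: "KEY_5", 1: "KEY_6", 2: "KEY_7", 3: "KEY_8", 4: "KEY_9", 5: "KEY_10"}

def fake_send_message(api_key: str) -> str:
    """Hypothetical transport; the real code reads chat_session.send_message(...).text."""
    return '{"competitors": []}'

def get_report(count: int = 0) -> dict:
    selected_api_key = api_key_map.get(count, "KEY_8")  # default key when count > 5
    for attempt in range(2):  # one try plus one retry, as in the diff
        try:
            return json.loads(fake_send_message(selected_api_key))
        except Exception as e:
            if attempt == 1:
                return {"details": str(e)}  # caller drops any report carrying "details"

print(get_report(count=7))  # a count past the map falls back to the default key
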
reddit/reddit_functions.py CHANGED
@@ -5,82 +5,119 @@ from reddit.reddit_sentiment_analysis import SentimentAnalysis
 from reddit.reddit_utils import get_microseconds_list
 from reddit.scraping import getPostComments, getSearchPostData
 import time
+import asyncio
+import os
+import concurrent.futures
+
+async def delete_files(file_names):
+    """Helper function to delete created files."""
+    for file_name in file_names:
+        try:
+            if os.path.exists(file_name):
+                os.remove(file_name)
+                print(f"Deleted file: {file_name}")
+        except Exception as e:
+            print(f"Error deleting file {file_name}: {e}")
+
+async def run_with_timeout(task_func, *args, timeout=300):
+    """Runs a task with a timeout."""
+    try:
+        return await asyncio.wait_for(task_func(*args), timeout=timeout)
+    except asyncio.TimeoutError:
+        print(f"Task exceeded {timeout} seconds timeout.")
+        raise
+
+async def getRedditData_with_timeout(user_query, search_keywords, retries=1, timeout=300):
+    """Retries the getRedditData process with a timeout."""
+    file_names = []
+    for attempt in range(retries + 1):
+        try:
+            result = await run_with_timeout(getRedditData, user_query, search_keywords, timeout=timeout)
+            return result
+        except Exception as e:
+            print(f"Attempt {attempt + 1} failed with error: {e}")
+            await delete_files(file_names)  # Delete created files
+            if attempt == retries:
+                raise Exception("Process failed after retries.") from e
 
 async def getRedditData(user_query, search_keywords):
     unique_list = get_microseconds_list()
     successful_steps = []
-
-    # Record the start time
     start_time = time.time()
-    fileNames=[]
+    fileNames = []
+
+    def log_step_time(step_name, start_time, success=True, error=None):
+        elapsed = time.time() - start_time
+        if success:
+            print(f"{step_name} completed successfully in {elapsed:.2f} seconds.")
+        else:
+            print(f"{step_name} failed in {elapsed:.2f} seconds. Error: {error}")
+
     # Step 1: Get search post data
     try:
-        # Use ThreadPoolExecutor to run tasks concurrently
+        step_start = time.time()
         with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
             futures = []
-            count =0
-            # Submit tasks in batches of 3
+            count = 0
             for i in range(len(search_keywords)):
-                print(f'Running task {i}')
-
                 future = executor.submit(getSearchPostData, search_keyword=search_keywords[i], index=unique_list[i], position=i)
                 futures.append(future)
 
                 if len(futures) == 3:
                     for future in concurrent.futures.as_completed(futures):
                         result = future.result()
-                        if result is not None:
+                        if result:
                             fileNames.append(f"posts_data_{result}.csv")
-                            successful_steps.append(('getSearchPostData', count)) # Mark this step as successful
-                            count+=1
+                            successful_steps.append(('getSearchPostData', count))
+                            count += 1
                     futures = []
 
             if futures:
                 for future in concurrent.futures.as_completed(futures):
                     result = future.result()
-                    if result is not None:
+                    if result:
                         fileNames.append(f"posts_data_{result}.csv")
-                        successful_steps.append(('getSearchPostData', count)) # Mark this step as successful
-                        count+=1
+                        successful_steps.append(('getSearchPostData', count))
+                        count += 1
+        log_step_time("getSearchPostData", step_start)
     except Exception as e:
-        print(f"Failed at getSearchPostData: {e}")
+        log_step_time("getSearchPostData", step_start, success=False, error=e)
 
-    # Step 3: Get final data
+    # Step 2: Get final data
     try:
-        print("fileNames", fileNames)
-        res=getFinalData(user_query=user_query, filesNames=fileNames)
+        step_start = time.time()
+        res = getFinalData(user_query=user_query, filesNames=fileNames)
         if res is True:
-            successful_steps.append(('getFinalData')) # Mark this step as successful
+            successful_steps.append(('getFinalData'))
+        log_step_time("getFinalData", step_start)
     except Exception as e:
-        print(f"Failed at getFinalData: {e}")
+        log_step_time("getFinalData", step_start, success=False, error=e)
 
-    # Step 4: Get post comments
+    # Step 3: Get post comments
     try:
+        step_start = time.time()
         await getPostComments(file_name=fileNames[0])
-        successful_steps.append(('getPostComments',)) # Mark this step as successful
+        successful_steps.append(('getPostComments',))
+        log_step_time("getPostComments", step_start)
     except Exception as e:
-        print(f"Failed at getPostComments: {e}")
-
-    # Record the time just after getting post comments
-    time_after_comments = time.time()
-    elapsed_time_after_comments = time_after_comments - start_time
-
-    # Start timer for sentiment file
+        log_step_time("getPostComments", step_start, success=False, error=e)
+    reddit_time = time.time() - start_time
     start_time = time.time()
-    # Step 5: Get sentiment of post comments
+    # Step 4: Get sentiment of post comments
     try:
+        step_start = time.time()
         sentiment_instance = SentimentAnalysis()
         sentiment_instance.generate_sentiment_and_emotion_from_data(fileName=fileNames[0])
-        successful_steps.append(('getPostSentiment',)) # Mark this step as successful
+        successful_steps.append(('getPostSentiment',))
+        log_step_time("getPostSentiment", step_start)
     except Exception as e:
-        print(f"Failed at getPostSentiment: {e}")
-    time_after_sentiment = time.time()
-
-    # Optionally, return the successful steps for logging or further processing
+        log_step_time("getPostSentiment", step_start, success=False, error=e)
+    sentiment_time = time.time() - start_time
     return {
-        "fileName":fileNames[0],
-        "fileUniqueId": str(unique_list[0]),
+        "fileName": fileNames[0] if fileNames else None,
+        'reddit_data': reddit_time,
+        'sentiment_data': sentiment_time,
+        "fileUniqueId": str(unique_list[0]) if unique_list else None,
         "successful_steps": successful_steps,
     }
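
The core addition is the retry-with-timeout wrapper: each attempt runs getRedditData under asyncio.wait_for, a failure or timeout triggers file cleanup and one more attempt, and the final failure re-raises with context. One caveat visible in the diff: file_names inside the wrapper is never populated, so the delete_files call is currently a no-op. Below is a runnable sketch of the shape, with a toy coroutine standing in for the scraping pipeline (names other than run_with_timeout are illustrative):

import asyncio

async def flaky_pipeline(user_query, search_keywords):
    """Toy stand-in for getRedditData: pretend scraping plus sentiment work."""
    await asyncio.sleep(0.1)
    return {"fileName": f"posts_data_{user_query}.csv", "successful_steps": search_keywords}

async def run_with_timeout(task_func, *args, timeout=300):
    try:
        return await asyncio.wait_for(task_func(*args), timeout=timeout)
    except asyncio.TimeoutError:
        print(f"Task exceeded {timeout} seconds timeout.")
        raise

async def retry_with_timeout(user_query, search_keywords, retries=1, timeout=300):
    for attempt in range(retries + 1):
        try:
            return await run_with_timeout(flaky_pipeline, user_query, search_keywords, timeout=timeout)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt == retries:
                raise Exception("Process failed after retries.") from e

print(asyncio.run(retry_with_timeout("note apps", ["note taking", "todo"])))
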
reddit/reddit_gemini.py CHANGED
@@ -7,7 +7,7 @@ from reddit.prompts import getKeywordsPrompt
 
 def getKeywords(user_query: str):
     prompt = getKeywordsPrompt(user_query)
-    model = genai.GenerativeModel("gemini-exp-1114")
+    model = genai.GenerativeModel("gemini-2.0-flash-exp")
 
     generation_config = genai.GenerationConfig(response_mime_type="application/json")
     try:
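
Only the model id changes here, but the call contract matters downstream: analyzeData above reads keywords['query'] and keywords['top_3_combinations'] out of this function's JSON, so the application/json response type must be preserved. A sketch of the call, assuming the google-generativeai SDK; the API key and prompt literal are placeholders:

import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # hypothetical key
model = genai.GenerativeModel("gemini-2.0-flash-exp")
generation_config = genai.GenerationConfig(response_mime_type="application/json")

response = model.generate_content(
    "Return JSON with keys 'query' and 'top_3_combinations' for: note taking apps",
    generation_config=generation_config,
)
print(response.text)  # JSON string, parsed downstream with json.loads
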
reddit/scraping.py CHANGED
@@ -302,7 +302,6 @@ async def getPostComments(file_name, is_for_competitor_analysis=False, index=0):
         if comments_json is not None:
             for i in range(len(data)):
                 if comments_json[i] is not None:
-                    print('Comment', comments_json[i]['index'], i)
                     data.at[comments_json[i]['index'], 'comments'] = {'comments':comments_json[i]['comments']}
                     data.at[comments_json[i]['index'], 'descriptions'] = comments_json[i]['description']
                 else: