Spaces:
Runtime error
Runtime error
Commit ·
c3837c5
1
Parent(s): 3e5b99f
added code
Browse files- Dockerfile +31 -0
- venv/.gitignore +54 -0
- venv/app.py +156 -0
- venv/models/reddit_models.py +6 -0
- venv/posts_data_1732244547776200.csv +0 -0
- venv/pyvenv.cfg +5 -0
- venv/reddit/api_keys.py +13 -0
- venv/reddit/prompts.py +340 -0
- venv/reddit/reddit_call.py +42 -0
- venv/reddit/reddit_community_post_scraper.py +66 -0
- venv/reddit/reddit_competitor_analysis.py +217 -0
- venv/reddit/reddit_functions.py +49 -0
- venv/reddit/reddit_gemini.py +70 -0
- venv/reddit/reddit_pain_point_analysis.py +51 -0
- venv/reddit/reddit_search_scrapper.py +55 -0
- venv/reddit/reddit_sentiment_analysis.py +53 -0
- venv/reddit/reddit_utils.py +38 -0
- venv/reddit/scraping.py +223 -0
- venv/test.py +200 -0
- venv/utils.py +26 -0
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as a parent image
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# Set the working directory to /app
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install required C++11 libraries and ca-certificates
|
| 8 |
+
RUN apt-get update -qq \
|
| 9 |
+
&& apt-get install -y \
|
| 10 |
+
build-essential \
|
| 11 |
+
python3-dev \
|
| 12 |
+
ca-certificates \
|
| 13 |
+
&& apt-get clean \
|
| 14 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
+
|
| 16 |
+
# Copy the current directory contents into the container at /app
|
| 17 |
+
COPY . /app
|
| 18 |
+
|
| 19 |
+
# Create and set up a virtual environment inside the container
|
| 20 |
+
RUN python -m venv /app/venv \
|
| 21 |
+
&& /app/venv/bin/pip install --no-cache-dir --upgrade pip \
|
| 22 |
+
&& /app/venv/bin/pip install --no-cache-dir -r requirements.txt
|
| 23 |
+
|
| 24 |
+
# Ensure the virtual environment's Python and pip are used by default
|
| 25 |
+
ENV PATH="/app/venv/bin:$PATH"
|
| 26 |
+
|
| 27 |
+
# Make port 8000 available to the world outside this container
|
| 28 |
+
EXPOSE 8000
|
| 29 |
+
|
| 30 |
+
# Run the application when the container launches
|
| 31 |
+
CMD ["uvicorn", "main:app", "--host", "127.0.0.23", "--port", "7860"]
|
venv/.gitignore
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environment
|
| 7 |
+
venv/
|
| 8 |
+
ENV/
|
| 9 |
+
env/
|
| 10 |
+
.venv/
|
| 11 |
+
.ENV/
|
| 12 |
+
|
| 13 |
+
# Jupyter Notebook checkpoints
|
| 14 |
+
.ipynb_checkpoints
|
| 15 |
+
|
| 16 |
+
# VS Code files
|
| 17 |
+
.vscode/
|
| 18 |
+
|
| 19 |
+
# PyCharm files
|
| 20 |
+
.idea/
|
| 21 |
+
|
| 22 |
+
# Distribution / Packaging
|
| 23 |
+
build/
|
| 24 |
+
dist/
|
| 25 |
+
Lib/
|
| 26 |
+
Include/
|
| 27 |
+
Scripts/
|
| 28 |
+
*.egg-info/
|
| 29 |
+
*.egg
|
| 30 |
+
*.whl
|
| 31 |
+
|
| 32 |
+
# Logs and debug files
|
| 33 |
+
*.log
|
| 34 |
+
|
| 35 |
+
# Test results
|
| 36 |
+
*.out
|
| 37 |
+
*.coverage
|
| 38 |
+
.coverage.*
|
| 39 |
+
|
| 40 |
+
# Environment variables and settings
|
| 41 |
+
.env
|
| 42 |
+
*.env
|
| 43 |
+
|
| 44 |
+
# Cache files
|
| 45 |
+
*.cache
|
| 46 |
+
.cache/
|
| 47 |
+
pip-log.txt
|
| 48 |
+
pip-delete-this-directory.txt
|
| 49 |
+
|
| 50 |
+
# macOS files
|
| 51 |
+
.DS_Store
|
| 52 |
+
|
| 53 |
+
# Windows files
|
| 54 |
+
Thumbs.db
|
venv/app.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import deque
|
| 2 |
+
from fastapi import FastAPI, HTTPException
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
from models.reddit_models import RedditPostDataModel
|
| 5 |
+
from reddit.reddit_functions import getRedditData
|
| 6 |
+
from reddit.reddit_gemini import getKeywords
|
| 7 |
+
from reddit.api_keys import api_key,api_key2
|
| 8 |
+
import google.generativeai as genai
|
| 9 |
+
|
| 10 |
+
from reddit.reddit_pain_point_analysis import pain_point_analysis
|
| 11 |
+
from reddit.reddit_search_scrapper import getCompetitorAnalysisData
|
| 12 |
+
from utils import time_execution
|
| 13 |
+
app = FastAPI()
|
| 14 |
+
import asyncio
|
| 15 |
+
from fastapi import HTTPException, APIRouter
|
| 16 |
+
from asyncio import TimeoutError
|
| 17 |
+
|
| 18 |
+
# Assuming you have defined the necessary imports, e.g., config, getKeywords, api_key, api_key2
|
| 19 |
+
|
| 20 |
+
# CORS configuration
|
| 21 |
+
origins = [
|
| 22 |
+
"*",
|
| 23 |
+
# Add more origins as needed
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
app.add_middleware(
|
| 27 |
+
CORSMiddleware,
|
| 28 |
+
allow_origins=origins, # Allows specified origins
|
| 29 |
+
allow_credentials=False,
|
| 30 |
+
allow_methods=["*"], # Allows all HTTP methods
|
| 31 |
+
allow_headers=["*"], # Allows all headers
|
| 32 |
+
)
|
| 33 |
+
from fastapi import FastAPI
|
| 34 |
+
from functools import wraps
|
| 35 |
+
|
| 36 |
+
app = FastAPI()
|
| 37 |
+
|
| 38 |
+
class Config:
|
| 39 |
+
def __init__(self):
|
| 40 |
+
self.called = False
|
| 41 |
+
self.retry=False
|
| 42 |
+
self.chat_session=None
|
| 43 |
+
self.queue=deque([])
|
| 44 |
+
|
| 45 |
+
def configure(self, api_key):
|
| 46 |
+
genai.configure(api_key=api_key)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
config = Config()
|
| 50 |
+
|
| 51 |
+
@app.get("/")
|
| 52 |
+
@time_execution
|
| 53 |
+
def read_root():
|
| 54 |
+
if not config.called:
|
| 55 |
+
print("called", config.called)
|
| 56 |
+
if config.retry:
|
| 57 |
+
config.configure(api_key=api_key2)
|
| 58 |
+
config.called = True
|
| 59 |
+
else:
|
| 60 |
+
config.configure(api_key=api_key)
|
| 61 |
+
config.called = True
|
| 62 |
+
return {"message": "Hello, World!"}
|
| 63 |
+
|
| 64 |
+
# Timeout handler: check if getKeywords takes too long
|
| 65 |
+
async def fetch_keywords_with_timeout(user_query: str, timeout: int = 60, retry: bool = True):
|
| 66 |
+
try:
|
| 67 |
+
# Simulate the getKeywords function with a timeout using asyncio.wait_for
|
| 68 |
+
keywords = await asyncio.wait_for(asyncio.to_thread(getKeywords, user_query), timeout=timeout)
|
| 69 |
+
return keywords
|
| 70 |
+
except TimeoutError:
|
| 71 |
+
print("Timeout exceeded, switching to api_key2")
|
| 72 |
+
if retry:
|
| 73 |
+
config.called = False
|
| 74 |
+
config.retry=True
|
| 75 |
+
# Timeout exceeded, switch to api_key2 and retry fetching keywords
|
| 76 |
+
read_root() # Switch API key
|
| 77 |
+
# Retry fetching keywords
|
| 78 |
+
return await fetch_keywords_with_timeout(user_query, timeout, retry=False) # Set retry to False to prevent infinite loop
|
| 79 |
+
else:
|
| 80 |
+
# If we already tried once, handle as a failure or return a fallback response
|
| 81 |
+
raise HTTPException(status_code=504, detail="Request timed out even after retrying")
|
| 82 |
+
|
| 83 |
+
@app.get("/keywords")
|
| 84 |
+
@time_execution
|
| 85 |
+
async def fetch_keywords(user_query: str):
|
| 86 |
+
if not user_query:
|
| 87 |
+
raise HTTPException(status_code=400, detail="User query must not be empty")
|
| 88 |
+
|
| 89 |
+
# Fetch keywords with a 10-second timeout and retry mechanism
|
| 90 |
+
keywords = await fetch_keywords_with_timeout(user_query=user_query)
|
| 91 |
+
return keywords
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@app.post("/getRedditPostsData")
|
| 95 |
+
@time_execution
|
| 96 |
+
def getRedditPostsData(request: RedditPostDataModel):
|
| 97 |
+
"""Requires user_query and search_keywords as arguments.
|
| 98 |
+
Steps involved in this api:
|
| 99 |
+
1. get posts data from reddit
|
| 100 |
+
2. filter top 18 posts
|
| 101 |
+
3. get comments data
|
| 102 |
+
4. get sentiment data
|
| 103 |
+
"""
|
| 104 |
+
try:
|
| 105 |
+
# Extract user_query and search_keywords from the request body
|
| 106 |
+
user_query = request.user_query
|
| 107 |
+
search_keywords = request.search_keywords
|
| 108 |
+
|
| 109 |
+
if not user_query:
|
| 110 |
+
raise HTTPException(status_code=400, detail="User query must not be empty")
|
| 111 |
+
|
| 112 |
+
if not search_keywords:
|
| 113 |
+
raise HTTPException(status_code=400, detail="Search keywords must not be empty")
|
| 114 |
+
print("user_query",user_query,"search_keywords",search_keywords)
|
| 115 |
+
result = getRedditData(user_query=user_query, search_keywords=search_keywords)
|
| 116 |
+
return result
|
| 117 |
+
except Exception as e:
|
| 118 |
+
raise HTTPException(status_code=500, detail=str(f"Failed to run getRedditPostsData : {e}"))
|
| 119 |
+
|
| 120 |
+
# pain point analysis api which takes user_query and fileName as arguments
|
| 121 |
+
@app.get("/getPainPointAnalysis")
|
| 122 |
+
@time_execution
|
| 123 |
+
def getPainPointAnalysis(user_query: str, fileName: str, uniqueFileId: str):
|
| 124 |
+
try:
|
| 125 |
+
# Extract user_query and fileName
|
| 126 |
+
if not user_query:
|
| 127 |
+
raise HTTPException(status_code=400, detail="User query must not be empty")
|
| 128 |
+
|
| 129 |
+
if not fileName:
|
| 130 |
+
raise HTTPException(status_code=400, detail="fileName must not be empty")
|
| 131 |
+
print("user_query",user_query,"fileName",fileName)
|
| 132 |
+
result=pain_point_analysis(user_query=user_query,fileName=fileName,uniqueFileId=uniqueFileId)
|
| 133 |
+
config.chat_session= result[1]
|
| 134 |
+
return result[0]
|
| 135 |
+
except Exception as e:
|
| 136 |
+
raise HTTPException(status_code=500, detail=str(f"Failed to run getPainPointAnalysis : {e}"))
|
| 137 |
+
|
| 138 |
+
# pain point analysis api which takes user_query and fileName as arguments
|
| 139 |
+
@app.get("/getCompetitorAnalysis")
|
| 140 |
+
@time_execution
|
| 141 |
+
def getCompetitorAnalysis(user_query: str, fileName: str,isSolo=True):
|
| 142 |
+
try:
|
| 143 |
+
# Extract user_query and fileName
|
| 144 |
+
if not user_query:
|
| 145 |
+
raise HTTPException(status_code=400, detail="User query must not be empty")
|
| 146 |
+
|
| 147 |
+
if not fileName:
|
| 148 |
+
raise HTTPException(status_code=400, detail="fileName must not be empty")
|
| 149 |
+
print("user_query",user_query,"isSolo",isSolo,"fileName",fileName)
|
| 150 |
+
result = getCompetitorAnalysisData(user_query=user_query,isSolo=isSolo,chat_session=config.chat_session,fileName=fileName)
|
| 151 |
+
return result
|
| 152 |
+
except Exception as e:
|
| 153 |
+
raise HTTPException(status_code=500, detail=str(f"Failed to run getCompetitorAnalysis : {e}"))
|
| 154 |
+
# if __name__ == "__main__":
|
| 155 |
+
# import uvicorn
|
| 156 |
+
# uvicorn.run("main:app", host="127.0.0.23", workers=1,reload=True,port=786)
|
venv/models/reddit_models.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
|
| 4 |
+
class RedditPostDataModel(BaseModel):
|
| 5 |
+
user_query: str
|
| 6 |
+
search_keywords: List[str]
|
venv/posts_data_1732244547776200.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
venv/pyvenv.cfg
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
home = C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0
|
| 2 |
+
include-system-site-packages = false
|
| 3 |
+
version = 3.11.9
|
| 4 |
+
executable = C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe
|
| 5 |
+
command = C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m venv D:\development\nextAnalytics\venv
|
venv/reddit/api_keys.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
load_dotenv()
|
| 5 |
+
|
| 6 |
+
api_key = os.getenv('GEMINI_API_KEY')
|
| 7 |
+
api_key2 = os.getenv('GEMINI_SECOND_API_KEY')
|
| 8 |
+
api_key3 = os.getenv('GEMINI_THIRD_API_KEY')
|
| 9 |
+
api_key4 = os.getenv('GEMINI_FOURTH_API_KEY')
|
| 10 |
+
api_key5 = os.getenv('GEMINI_FIVE_API_KEY')
|
| 11 |
+
api_key6 = os.getenv('GEMINI_SIX_API_KEY')
|
| 12 |
+
api_key7 = os.getenv('GEMINI_SEVEN_API_KEY')
|
| 13 |
+
api_key8 = os.getenv('GEMINI_EIGHT_API_KEY')
|
venv/reddit/prompts.py
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def featureAnalysisPrompt():
|
| 2 |
+
return f'''
|
| 3 |
+
{{
|
| 4 |
+
"competitor": "COMPETITOR_NAME",
|
| 5 |
+
"user_query": "USER_QUERY",
|
| 6 |
+
"competitor_data_source": "DATA_SOURCE",
|
| 7 |
+
"overall_sentiment": {{
|
| 8 |
+
"positive_percentage": "PERCENTAGE",
|
| 9 |
+
"negative_percentage": "PERCENTAGE",
|
| 10 |
+
"neutral_percentage": "PERCENTAGE"
|
| 11 |
+
}},
|
| 12 |
+
"features": [
|
| 13 |
+
{{
|
| 14 |
+
"feature": "FEATURE_NAME",
|
| 15 |
+
"feature_description": "FEATURE_DESCRIPTION",
|
| 16 |
+
"sentiment_analysis": {{
|
| 17 |
+
"positive": "PERCENTAGE",
|
| 18 |
+
"negative": "PERCENTAGE",
|
| 19 |
+
"neutral": "PERCENTAGE"
|
| 20 |
+
}},
|
| 21 |
+
"emotion_analysis": {{
|
| 22 |
+
"joy": "PERCENTAGE",
|
| 23 |
+
"anger": "PERCENTAGE",
|
| 24 |
+
"disappointment": "PERCENTAGE",
|
| 25 |
+
"surprise": "PERCENTAGE"
|
| 26 |
+
}},
|
| 27 |
+
"user_personas": {{
|
| 28 |
+
"age_groups": [
|
| 29 |
+
{{
|
| 30 |
+
"age_range": "AGE_RANGE",
|
| 31 |
+
"positive_sentiment_count": "COUNT",
|
| 32 |
+
"negative_sentiment_count": "COUNT",
|
| 33 |
+
"neutral_sentiment_count": "COUNT"
|
| 34 |
+
}},
|
| 35 |
+
{{
|
| 36 |
+
"age_range": "AGE_RANGE",
|
| 37 |
+
"positive_sentiment_count": "COUNT",
|
| 38 |
+
"negative_sentiment_count": "COUNT",
|
| 39 |
+
"neutral_sentiment_count": "COUNT"
|
| 40 |
+
}}
|
| 41 |
+
],
|
| 42 |
+
"gender": [
|
| 43 |
+
{{
|
| 44 |
+
"gender": "GENDER_TYPE",
|
| 45 |
+
"positive_sentiment_count": "COUNT",
|
| 46 |
+
"negative_sentiment_count": "COUNT",
|
| 47 |
+
"neutral_sentiment_count": "COUNT"
|
| 48 |
+
}}
|
| 49 |
+
]
|
| 50 |
+
}},
|
| 51 |
+
"adoption_rate": {{
|
| 52 |
+
"early_adopters": "PERCENTAGE",
|
| 53 |
+
"mainstream_users": "PERCENTAGE",
|
| 54 |
+
"dissatisfied_users": "PERCENTAGE"
|
| 55 |
+
}},
|
| 56 |
+
"usage_behavior": {{
|
| 57 |
+
"frequency_of_use": {{
|
| 58 |
+
"daily": "COUNT",
|
| 59 |
+
"weekly": "COUNT",
|
| 60 |
+
"occasionally": "COUNT",
|
| 61 |
+
"never": "COUNT"
|
| 62 |
+
}},
|
| 63 |
+
"engagement_level": {{
|
| 64 |
+
"high": "COUNT",
|
| 65 |
+
"medium": "COUNT",
|
| 66 |
+
"low": "COUNT"
|
| 67 |
+
}}
|
| 68 |
+
}},
|
| 69 |
+
"feature_comparison": {{
|
| 70 |
+
"strengths": [
|
| 71 |
+
"STRENGTH_1",
|
| 72 |
+
"STRENGTH_2"
|
| 73 |
+
],
|
| 74 |
+
"weaknesses": [
|
| 75 |
+
"WEAKNESS_1",
|
| 76 |
+
"WEAKNESS_2"
|
| 77 |
+
],
|
| 78 |
+
"opportunities_for_improvement": [
|
| 79 |
+
"OPPORTUNITY_1",
|
| 80 |
+
"OPPORTUNITY_2"
|
| 81 |
+
],
|
| 82 |
+
"threats": [
|
| 83 |
+
"THREAT_1",
|
| 84 |
+
"THREAT_2"
|
| 85 |
+
]
|
| 86 |
+
}},
|
| 87 |
+
"pain_points": [
|
| 88 |
+
{{
|
| 89 |
+
"issue": "ISSUE_DESCRIPTION",
|
| 90 |
+
"impact": "ISSUE_IMPACT",
|
| 91 |
+
"mentions": "COUNT",
|
| 92 |
+
"user_sentiment": "SENTIMENT"
|
| 93 |
+
}}
|
| 94 |
+
],
|
| 95 |
+
"feature_improvements_suggestions": [
|
| 96 |
+
{{
|
| 97 |
+
"suggestion": "SUGGESTION_DESCRIPTION",
|
| 98 |
+
"priority": "PRIORITY"
|
| 99 |
+
}}
|
| 100 |
+
]
|
| 101 |
+
}}
|
| 102 |
+
],
|
| 103 |
+
"conclusion": {{
|
| 104 |
+
"summary": "SUMMARY_OF_KEY_FINDINGS",
|
| 105 |
+
"recommendations": [
|
| 106 |
+
"RECOMMENDATION_1",
|
| 107 |
+
"RECOMMENDATION_2"
|
| 108 |
+
]
|
| 109 |
+
}}
|
| 110 |
+
}}
|
| 111 |
+
'''
|
| 112 |
+
|
| 113 |
+
def getPainPointAnalysisPrompt(user_query):
|
| 114 |
+
return f"""
|
| 115 |
+
1. analyze the given csv data of reddit posts with sentiments and provide a list of pain point analysis categories for the user query ={user_query}. categories in pain point analysis should be set of 2 keyword only which emphasis the abstract of pain point and also rank them based on importance and relevancy to user query
|
| 116 |
+
return categories title only which attracts the readers . get your data in json format {{
|
| 117 |
+
"pain_point_categories": [
|
| 118 |
+
"category1",
|
| 119 |
+
"category2",
|
| 120 |
+
"category3",
|
| 121 |
+
.
|
| 122 |
+
.
|
| 123 |
+
.
|
| 124 |
+
]
|
| 125 |
+
}}
|
| 126 |
+
2. For all the pain points categories that you got from first step, analyze the file_with_sentiment.csv data and peform pain point analysis on it and return the response in JSON format provided below for all categories, nothing else.
|
| 127 |
+
{{
|
| 128 |
+
"pain_point_analysis": {{
|
| 129 |
+
"key_insights": ["insight1", "insight2",...],
|
| 130 |
+
"pain_points": [
|
| 131 |
+
{{
|
| 132 |
+
"category1": "Category of Pain Point (e.g., Product Issues, Customer Service, Pricing)",
|
| 133 |
+
"pain_point": "Brief description of the issue (e.g., Slow Performance)",
|
| 134 |
+
"frequency": "number",
|
| 135 |
+
"sentiment_analysis": {{
|
| 136 |
+
"positive": "number",
|
| 137 |
+
"neutral": "number",
|
| 138 |
+
"negative": "number"
|
| 139 |
+
}},
|
| 140 |
+
"related_features": ["feature1", "feature2",...],
|
| 141 |
+
"examples": [
|
| 142 |
+
{{
|
| 143 |
+
"post_title": "Title of the post/comment",
|
| 144 |
+
"comment": "Description or excerpt of user comment",
|
| 145 |
+
"upvotes": "number",
|
| 146 |
+
"post_url": "URL of the original post/comment"
|
| 147 |
+
}},
|
| 148 |
+
.
|
| 149 |
+
.
|
| 150 |
+
.
|
| 151 |
+
],
|
| 152 |
+
"recommended_actions": [
|
| 153 |
+
"Recommended solution/action 1",
|
| 154 |
+
"Recommended solution/action 2",
|
| 155 |
+
.
|
| 156 |
+
.
|
| 157 |
+
.
|
| 158 |
+
]
|
| 159 |
+
}},
|
| 160 |
+
similarly for remaining categories
|
| 161 |
+
.
|
| 162 |
+
.
|
| 163 |
+
.
|
| 164 |
+
],
|
| 165 |
+
"overall_insights": {{
|
| 166 |
+
"top_pain_points": ["pain_point1", "pain_point2",...],
|
| 167 |
+
"user_segments_most_affected": ["segment1", "segment2",...],
|
| 168 |
+
"impact_on_product_development": [
|
| 169 |
+
"Insight for development 1",
|
| 170 |
+
"Insight for development 2",
|
| 171 |
+
.
|
| 172 |
+
.
|
| 173 |
+
.
|
| 174 |
+
]
|
| 175 |
+
}}
|
| 176 |
+
}}
|
| 177 |
+
}}
|
| 178 |
+
"""
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def getKeywordsPrompt(user_query):
|
| 182 |
+
return f"""1. Enhance the user query ="{user_query}" for better relevance to the main intent of query
|
| 183 |
+
get the enhanced query in JSON:
|
| 184 |
+
{{
|
| 185 |
+
"query": "enhanced_user_query"
|
| 186 |
+
}}.
|
| 187 |
+
2. Extract keywords from the enhanced query and get them in JSON:
|
| 188 |
+
{{
|
| 189 |
+
"keywords": ["keyword1", ...]
|
| 190 |
+
}}.
|
| 191 |
+
3. Create relevant combinations of 2-3 keywords for the enhanced query context.
|
| 192 |
+
4. Get 3 keyword combinations with 95% relevancy to the original user query in JSON:
|
| 193 |
+
{{
|
| 194 |
+
"top_3_combinations": ["combination phrase 1", "combination phrase 2", "combination phrase 3"]
|
| 195 |
+
}}.
|
| 196 |
+
5. Return the final output in JSON format:
|
| 197 |
+
{{
|
| 198 |
+
"query": "enhanced_user_query",
|
| 199 |
+
"keywords": ["keyword1", ...],
|
| 200 |
+
"top_3_combinations": ["combination phrase 1", "combination phrase 2", "combination phrase 3"]
|
| 201 |
+
}}.
|
| 202 |
+
"""
|
| 203 |
+
|
| 204 |
+
def getCompetitorPrompt(user_query):
|
| 205 |
+
pain_point_prompt =getPainPointAnalysisPrompt(user_query = user_query)
|
| 206 |
+
return f'''
|
| 207 |
+
1.{pain_point_prompt}
|
| 208 |
+
2. Perform competitor analysis on the given csv file and return your final output in JSON format:
|
| 209 |
+
{{
|
| 210 |
+
"competitor_analysis": {{
|
| 211 |
+
"competitor_name": "<CompetitorName>",
|
| 212 |
+
"overview": {{
|
| 213 |
+
"date_range": "<Start_Date> to <End_Date>",
|
| 214 |
+
"total_posts_analyzed": "<Total_Posts>",
|
| 215 |
+
"total_comments_analyzed": "<Total_Comments>"
|
| 216 |
+
}},
|
| 217 |
+
"market_sentiment": {{
|
| 218 |
+
"overall": {{
|
| 219 |
+
"positive": "<Positive_Percentage>",
|
| 220 |
+
"neutral": "<Neutral_Percentage>",
|
| 221 |
+
"negative": "<Negative_Percentage>"
|
| 222 |
+
}},
|
| 223 |
+
"trend_over_time": {{
|
| 224 |
+
"<Year-Month>": {{
|
| 225 |
+
"positive": "<Positive_Percentage>",
|
| 226 |
+
"neutral": "<Neutral_Percentage>",
|
| 227 |
+
"negative": "<Negative_Percentage>"
|
| 228 |
+
}},
|
| 229 |
+
"<Year-Month>": {{
|
| 230 |
+
"positive": "<Positive_Percentage>",
|
| 231 |
+
"neutral": "<Neutral_Percentage>",
|
| 232 |
+
"negative": "<Negative_Percentage>"
|
| 233 |
+
}}
|
| 234 |
+
}}
|
| 235 |
+
}},
|
| 236 |
+
"pain_points": {{pain point from step 1}},
|
| 237 |
+
"features_and_differentiators": [
|
| 238 |
+
{{
|
| 239 |
+
"feature": "<Feature_Name>",
|
| 240 |
+
"sentiment": "<Sentiment_Type>",
|
| 241 |
+
"mentions": "<Mentions>",
|
| 242 |
+
"related_comments": [
|
| 243 |
+
{{
|
| 244 |
+
"comment": "<Comment_Text>",
|
| 245 |
+
"upvotes": "<Upvotes>",
|
| 246 |
+
"post_url": "<Post_URL>"
|
| 247 |
+
}}
|
| 248 |
+
.
|
| 249 |
+
.
|
| 250 |
+
.
|
| 251 |
+
]
|
| 252 |
+
}}
|
| 253 |
+
.
|
| 254 |
+
.
|
| 255 |
+
.
|
| 256 |
+
],
|
| 257 |
+
"sentiment_by_feature": {{
|
| 258 |
+
"<Feature_Name>": {{
|
| 259 |
+
"positive": "<Positive_Percentage>",
|
| 260 |
+
"neutral": "<Neutral_Percentage>",
|
| 261 |
+
"negative": "<Negative_Percentage>"
|
| 262 |
+
}},
|
| 263 |
+
"<Feature_Name>": {{
|
| 264 |
+
"positive": "<Positive_Percentage>",
|
| 265 |
+
"neutral": "<Neutral_Percentage>",
|
| 266 |
+
"negative": "<Negative_Percentage>"
|
| 267 |
+
}}
|
| 268 |
+
}},
|
| 269 |
+
"audience_analysis": {{
|
| 270 |
+
"popular_subreddits": [
|
| 271 |
+
"<Subreddit_1>",
|
| 272 |
+
"<Subreddit_2>"
|
| 273 |
+
],
|
| 274 |
+
"user_segments": [
|
| 275 |
+
"<User_Segment_1>",
|
| 276 |
+
"<User_Segment_2>"
|
| 277 |
+
]
|
| 278 |
+
}},
|
| 279 |
+
"pricing_feedback": {{
|
| 280 |
+
"value_perception": {{
|
| 281 |
+
"positive": "<Positive_Percentage>",
|
| 282 |
+
"neutral": "<Neutral_Percentage>",
|
| 283 |
+
"negative": "<Negative_Percentage>"
|
| 284 |
+
}},
|
| 285 |
+
"related_comments": [
|
| 286 |
+
{{
|
| 287 |
+
"comment": "<Comment_Text>",
|
| 288 |
+
"upvotes": "<Upvotes>",
|
| 289 |
+
"post_url": "<Post_URL>"
|
| 290 |
+
}}
|
| 291 |
+
]
|
| 292 |
+
}},
|
| 293 |
+
"competitor_strengths": [
|
| 294 |
+
"<Strength_1>",
|
| 295 |
+
"<Strength_2>",
|
| 296 |
+
.
|
| 297 |
+
.
|
| 298 |
+
.
|
| 299 |
+
],
|
| 300 |
+
"competitor_weaknesses": [
|
| 301 |
+
"<Weakness_1>",
|
| 302 |
+
"<Weakness_2>",
|
| 303 |
+
.
|
| 304 |
+
.
|
| 305 |
+
.
|
| 306 |
+
],
|
| 307 |
+
"user_recommendations": [
|
| 308 |
+
"<Recommendation_1>",
|
| 309 |
+
"<Recommendation_2>",
|
| 310 |
+
.
|
| 311 |
+
.
|
| 312 |
+
.
|
| 313 |
+
],
|
| 314 |
+
"competitive_strategy": {{
|
| 315 |
+
"pricing_strategy": "<Pricing_Strategy>",
|
| 316 |
+
"feature_improvement": "<Feature_Strategy>",
|
| 317 |
+
.
|
| 318 |
+
.
|
| 319 |
+
.
|
| 320 |
+
}}
|
| 321 |
+
}}
|
| 322 |
+
}}
|
| 323 |
+
|
| 324 |
+
'''
|
| 325 |
+
|
| 326 |
+
def getTop10CompetitorPrompt(reddit_data,gemini_data,user_query):
|
| 327 |
+
return f'''
|
| 328 |
+
Competitor names from reddit are:
|
| 329 |
+
{reddit_data}
|
| 330 |
+
|
| 331 |
+
Competitor name from gemini are:
|
| 332 |
+
{gemini_data}
|
| 333 |
+
|
| 334 |
+
both are for the user query: "{user_query}"
|
| 335 |
+
|
| 336 |
+
get me 10 most relevant competitors for the given user query from both the data and return a combined list in following json format, nothing else and try not to include very general competitors into the list which are not directly related to user query, be specific according to the user query. competitor1 ,2,..10 should be the details given in the json data:
|
| 337 |
+
{{
|
| 338 |
+
list:[competitor1,competitor2,....competitor10]
|
| 339 |
+
}}
|
| 340 |
+
'''
|
venv/reddit/reddit_call.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reddit Data Scrapper
|
| 2 |
+
from reddit_utils import get_microseconds_list
|
| 3 |
+
from reddit_pain_point_analysis import pain_point_analysis
|
| 4 |
+
from reddit_sentiment_analysis import SentimentAnalysis
|
| 5 |
+
from reddit_gemini import getKeywords
|
| 6 |
+
from reddit_search_scrapper import getCompetitorAnalysisData, getFinalData
|
| 7 |
+
import google.generativeai as genai
|
| 8 |
+
from scraping import driver, getPostComments, getSearchPostData
|
| 9 |
+
from api_keys import api_key
|
| 10 |
+
genai.configure(api_key=api_key)
|
| 11 |
+
|
| 12 |
+
user_query = "AI image"
|
| 13 |
+
|
| 14 |
+
def redditScrapper(user_query):
|
| 15 |
+
search_keywords=getKeywords(user_query=user_query)
|
| 16 |
+
# unique_list = get_microseconds_list()
|
| 17 |
+
# for i in range(len(search_keywords["top_3_combinations"])):
|
| 18 |
+
# getSearchPostData( search_keyword=search_keywords['top_3_combinations'][i],index=unique_list[i])
|
| 19 |
+
|
| 20 |
+
# getFinalData(user_query=user_query)
|
| 21 |
+
# getPostComments()
|
| 22 |
+
sentiment_instance= SentimentAnalysis()
|
| 23 |
+
sentiment_instance.generate_sentiment_and_emotion_from_data(fileName='posts_data_1732105228633815.csv')
|
| 24 |
+
# sentiment_data.to_csv("file_with_sentiment.csv", index=False)
|
| 25 |
+
# # Specify the file path
|
| 26 |
+
# file_path = 'posts_data.csv'
|
| 27 |
+
|
| 28 |
+
# # Check if the file exists before attempting to delete
|
| 29 |
+
# if os.path.exists(file_path):
|
| 30 |
+
# os.remove(file_path)
|
| 31 |
+
# print("File deleted successfully")
|
| 32 |
+
# else:
|
| 33 |
+
# print("File does not exist")
|
| 34 |
+
# pain_point_analysis(user_query = "AI image generation techniques and applications", fileName="posts_data.csv")
|
| 35 |
+
getCompetitorAnalysisData(user_query="social media analytics tools and strategies")
|
| 36 |
+
|
| 37 |
+
driver.quit()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Call the function
|
| 41 |
+
redditScrapper(user_query)
|
| 42 |
+
|
venv/reddit/reddit_community_post_scraper.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import praw
|
| 2 |
+
|
| 3 |
+
reddit = praw.Reddit(
|
| 4 |
+
client_id="yjGfys3QZPpdCpNZl25Kig",
|
| 5 |
+
client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
|
| 6 |
+
password="&honeyB90",
|
| 7 |
+
user_agent="Curious",
|
| 8 |
+
username="Final-Difference7055",
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
subRed = reddit.subreddit("SkincareAddiction")
|
| 12 |
+
data=[]
|
| 13 |
+
for posts in subRed.hot(limit=25):
|
| 14 |
+
data.append([posts.title,posts.selftext,f"https://www.reddit.com/r/{subRed}/comments/{posts.id}"])
|
| 15 |
+
|
| 16 |
+
def process_comment(comment):
|
| 17 |
+
# Prepare the comment data
|
| 18 |
+
comment_data = {
|
| 19 |
+
"user": comment.author.name if comment.author else "Unknown",
|
| 20 |
+
"comment": comment.body,
|
| 21 |
+
"replies": [] # Initialize replies list
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
# Process replies recursively if any
|
| 25 |
+
if comment.replies:
|
| 26 |
+
for reply in comment.replies:
|
| 27 |
+
reply_data = process_comment(reply) # Recursive call for replies
|
| 28 |
+
comment_data["replies"].append(reply_data)
|
| 29 |
+
|
| 30 |
+
return comment_data
|
| 31 |
+
# For every collected post, fetch its full comment tree via PRAW and
# append the nested comment JSON to that post's row in `data`.
for url in range(len(data)):
    # comment_data_sub=[]
    submission = reddit.submission(url=data[url][2])

    # Fetch and process comments
    submission.comments.replace_more(limit=2) # Use limit=0 to get all comments
    comments_data = []

    # Function to process a comment and its replies
    # Seed with top-level comments
    comment_queue = list(submission.comments)

    # Drain the queue of top-level comments; process_comment handles
    # the reply tree recursively.
    while comment_queue:
        comment = comment_queue.pop(0)
        comment_data = process_comment(comment) # Process each comment
        comments_data.append(comment_data)

    # Now, structure the data into the desired JSON format
    json_output = {
        "comments": comments_data
    }
    data[url].append(json_output)

# Keep only title (0), selftext (1) and the comment JSON (3);
# the URL at index 2 is dropped from the CSV output.
new_data=[]
for i in range(len(data)):
    new_data.append([data[i][0],data[i][1],data[i][3]])
import pandas as pd


# Convert the list to a DataFrame and specify column names
df = pd.DataFrame(new_data, columns=["Title", "Description", "Comments"])

# Save the DataFrame as a CSV file
df.to_csv('output.csv', index=False)

print("Data saved to output.csv")
|
venv/reddit/reddit_competitor_analysis.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import google.generativeai as genai
|
| 4 |
+
|
| 5 |
+
from reddit.prompts import getCompetitorPrompt, getTop10CompetitorPrompt
|
| 6 |
+
from reddit.reddit_utils import get_microseconds_list
|
| 7 |
+
from reddit.scraping import getPostComments, getSearchPostData
|
| 8 |
+
from reddit.reddit_gemini import getModelAndGenerationConfigCommon
|
| 9 |
+
from reddit.api_keys import api_key3,api_key4,api_key5,api_key6,api_key7,api_key8
|
| 10 |
+
|
| 11 |
+
def getCompetitorNames(user_query):  # model, generation_config
    """Ask Gemini for competitors/alternatives relevant to ``user_query``.

    Returns the parsed JSON dict with "competitors" and "platforms" keys,
    or ``{"details": <error message>}`` when both attempts fail.
    """
    prompt = f"""Extract a list of product names, alternatives, and competitors relevant to the query: {user_query}. Ensure that the results focus on tools, platforms, or services explicitly aligned with the domain and purpose of the query. Avoid including general or loosely related products unless they directly offer features tailored to the query's intent.

Additionally, provide the platform(s) (e.g., web, apps, integrations) on which each competitor operates and categorize their functionality. Include a frequency count indicating the number of times each entry is mentioned or relevant to the query. Also, aggregate the total frequency of each platform across all entries. Also give provide popularity score for each competitor out of 100.
give top 6 competitors details only.
return in given json format only:
{{
"competitors":[{{"name":"","platform":[],"category":"","count":number,"popularity":number}}],
"platforms":[{{"platform":name,count:number}}]
}}"""
    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "application/json",
    }

    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro-002",
        generation_config=generation_config,
    )
    # One retry covers both transient API errors and malformed JSON.
    # (Previously the try/retry bodies were duplicated under bare excepts.)
    last_error = None
    for _ in range(2):
        try:
            response = model.generate_content(prompt)
            data = response.text
            print("getCompetitorNames", data)
            return json.loads(data)
        except Exception as e:
            last_error = e
    return {"details": str(last_error)}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def getCompetitorNamesFromReddit(user_query, fileName, isSolo=True, last_chat_session=None):  # model, generation_config
    """Extract competitor names from scraped Reddit CSV data via Gemini.

    When ``isSolo`` is True, ``fileName`` is uploaded and a fresh chat
    session is created; otherwise ``last_chat_session`` is reused.
    Returns the parsed JSON dict, or ``{"details": <error message>}``
    when both attempts fail.
    """
    prompt = f"""Extract a list of product names, alternatives, and competitors relevant to the query: {user_query} from the given csv data. Ensure that the results focus on tools, platforms, or services explicitly aligned with the domain and purpose of the query and do not include very general competitors into the list which are not directly related to user query use case or intent.

Additionally, provide the platform(s) (e.g., web, apps, integrations) on which each competitor operates and categorize their functionality.Category should be string. Include a frequency count indicating the number of times each entry is mentioned or relevant to the query. Also, aggregate the total frequency of each platform across all entries. Also give provide popularity score for each competitor out of 100.
give top 6 competitors details only.
return in given json format only:
{{
"competitors":[{{"name":"","platform":[],"category":"","count":number,"popularity":number}}],
"platforms":[{{"platform":name,count:number}}]
}}"""
    chat_session = None
    if isSolo:
        # Upload the CSV and seed the chat with it plus the prompt.
        data = getModelAndGenerationConfigCommon(fileName=fileName, isFlash=False)
        model = data[0]
        chat_session = model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        data[1],
                        prompt
                    ],
                }
            ]
        )
    else:
        chat_session = last_chat_session
    # One retry covers transient API errors and malformed JSON responses.
    # (Previously the try/retry bodies were duplicated under bare excepts.)
    last_error = None
    for _ in range(2):
        try:
            response = chat_session.send_message("give your last response of competitor names")
            data = response.text
            print("getCompetitorNames", data)
            return json.loads(data)
        except Exception as e:
            last_error = e
    return {"details": str(last_error)}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# def top10competitor(reddit_data,gemini_data):
|
| 94 |
+
# # ensure path 1 is reddit and path 2 is gemini
|
| 95 |
+
# d1 = reddit_data
|
| 96 |
+
# d2 = gemini_data
|
| 97 |
+
# popularity = {}
|
| 98 |
+
# count = {}
|
| 99 |
+
# for x in d1['competitors']:count[x['name']] = x['count']
|
| 100 |
+
# for x in d2['competitors']:
|
| 101 |
+
# if x['name'] in count.keys(): continue
|
| 102 |
+
# else:popularity[x['name']] = x['popularity']
|
| 103 |
+
# ma = sum(count.values())/len(count.values())
|
| 104 |
+
# mb = sum(popularity.values())/len(popularity.values())
|
| 105 |
+
# df = pd.DataFrame(d1['competitors'])
|
| 106 |
+
# df['extracted_by'] = 'reddit'
|
| 107 |
+
# df2 = pd.DataFrame(d2['competitors'])
|
| 108 |
+
# df2['extracted_by'] = 'gemini'
|
| 109 |
+
# df2['count'] = df2['popularity']/(mb/ma)
|
| 110 |
+
# df = pd.concat([df, df2], axis=0)
|
| 111 |
+
# df = df.drop_duplicates('name')
|
| 112 |
+
# df = df.reset_index(drop=True)
|
| 113 |
+
# df = df.sort_values(by ='count', ascending=False)
|
| 114 |
+
# df = df.reset_index(drop=True)
|
| 115 |
+
# df = df.head(10)
|
| 116 |
+
# df = df.drop('extracted_by', axis=1)
|
| 117 |
+
# return df
|
| 118 |
+
|
| 119 |
+
def getTop10Competitors(user_query, reddit_data, gemini_data):
    """Merge Reddit- and Gemini-derived competitor lists via a Gemini prompt.

    Returns the parsed JSON (callers read its "list" key), or
    ``{"details": <error message>}`` when the call or parsing fails.
    """
    prompt = getTop10CompetitorPrompt(user_query=user_query, reddit_data=reddit_data, gemini_data=gemini_data)
    model = genai.GenerativeModel("gemini-exp-1114")

    generation_config = genai.GenerationConfig(response_mime_type="application/json")

    try:
        response = model.generate_content(prompt, generation_config=generation_config)  # Adjust if the library supports async
        data = response.text
        print("getTop10Competitors:", data)
        return json.loads(data)
    except Exception as e:
        # BUG FIX: the old `print("...: %s", e)` never interpolated (printed a
        # literal "%s"), and the function fell through returning None.
        print(f"Error while fetching getTop10Competitors: {e}")
        return {"details": str(e)}
|
| 132 |
+
|
| 133 |
+
def getPostDataofCompetitor(fileName, user_query):
    """Scrape Reddit search results for each competitor and analyse them.

    ``fileName`` is (despite the name) a pandas DataFrame of competitors
    with "name" and "category" columns. Returns the list of per-competitor
    analysis reports produced by preprocessingCompetitorsData.
    """
    df = fileName
    unique_ids = get_microseconds_list(length=len(df))
    collected_ids = []
    for pos in range(len(df)):
        row = df.iloc[pos]
        # Search "<name> <category>" so results stay domain-specific.
        result = getSearchPostData(
            forCompetitorAnalysis=True,
            search_keyword=f"{row['name']} {row['category']}",
            name=row['name'],
            index=unique_ids[pos],
        )
        if result is not None:
            collected_ids.append(result)
    print("actual_list", collected_ids)
    print("Fetched data for competitors")
    csv_names = [f"posts_data_{uid}.csv" for uid in collected_ids]
    return preprocessingCompetitorsData(user_query=user_query, fileNames=csv_names, fileUniqueIds=collected_ids)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def preprocessingCompetitorsData(user_query, fileNames, fileUniqueIds):
    """Fetch comments and run the analysis report for up to 6 competitors.

    Successful reports are written to ``competitor_analysis_report_<id>.json``
    and collected; the intermediate post CSVs are deleted afterwards.
    Returns the list of successful report dicts.
    """
    processed = 0
    reports = []
    for csv_name, unique_id in zip(fileNames, fileUniqueIds):
        if processed == 6:
            break
        print(f"Processing file {csv_name}")
        # get posts comments data
        getPostComments(fileName=csv_name)
        report = getCompetitorAnalysisReport(user_query=user_query, fileName=csv_name, count=processed)
        processed += 1
        # A "details" key marks an error payload — skip this competitor.
        if "details" in report.keys():
            continue
        out_name = f"competitor_analysis_report_{unique_id}.json"
        with open(out_name, "w") as outfile:
            json.dump(report, outfile)
        print("Competitor Analysis Report", out_name)
        reports.append(report)

    # Intermediate scrape CSVs are no longer needed once reports exist.
    for file_path in fileNames:
        if os.path.exists(file_path):
            os.remove(file_path)
            print("File deleted successfully")
        else:
            print("File does not exist")
    return reports
|
| 174 |
+
|
| 175 |
+
def getCompetitorAnalysisReport(user_query, fileName, count=0):
    """Generate a competitor-analysis JSON report for one scraped CSV.

    ``count`` selects which API key to use so each competitor file runs
    on a different quota. Returns the parsed JSON report, or
    ``{"details": <error message>}`` when both attempts fail.
    """
    prompt = getCompetitorPrompt(user_query=user_query)
    # Rotate across API keys: one per competitor slot, 0..5.
    api_key_map = {
        0: api_key3,
        1: api_key4,
        2: api_key5,
        3: api_key6,
        4: api_key7,
        5: api_key8
    }

    selected_api_key = api_key_map.get(count, api_key8)  # Default to api_key8 if count > 5
    genai.configure(api_key=selected_api_key)
    data = getModelAndGenerationConfigCommon(fileName=fileName, isFlash=False)
    model = data[0]
    chat_session = model.start_chat(
        history=[
            {
                "role": "user",
                "parts": [
                    data[1],
                    prompt
                ],
            }
        ]
    )

    # One retry covers transient API errors and malformed JSON responses.
    # (Previously the try/retry bodies were duplicated under bare excepts,
    # and the log label wrongly said "getCompetitorNames".)
    last_error = None
    for _ in range(2):
        try:
            response = chat_session.send_message("give your last response of competitor analysis")
            payload = response.text
            print("getCompetitorAnalysisReport", payload)
            return json.loads(payload)
        except Exception as e:
            last_error = e
    print("competitor analysis error", str(last_error))
    return {"details": str(last_error)}
|
venv/reddit/reddit_functions.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from reddit.reddit_search_scrapper import getFinalData
|
| 3 |
+
from reddit.reddit_sentiment_analysis import SentimentAnalysis
|
| 4 |
+
from reddit.reddit_utils import get_microseconds_list
|
| 5 |
+
from reddit.scraping import getPostComments, getSearchPostData
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def getRedditData(user_query, search_keywords):
    """Run the full Reddit pipeline: scrape, merge, fetch comments, sentiment.

    Each step is attempted independently; failures are printed and recorded
    by omission from ``successful_steps``. Returns a dict with the primary
    output CSV name, its unique id, and the list of steps that succeeded.
    """
    # BUG FIX: size the id list to the keyword count — the old default of 3
    # raised IndexError whenever more than three keywords were supplied.
    unique_list = get_microseconds_list(length=len(search_keywords))

    successful_steps = []

    # Step 1: Get search post data
    for i in range(len(search_keywords)):
        try:
            getSearchPostData(search_keyword=search_keywords[i], index=unique_list[i])
            successful_steps.append(('getSearchPostData', i))  # Mark this step as successful
        except Exception as e:
            print(f"Failed at getSearchPostData for keyword {search_keywords[i]}: {e}")

    # Step 2: Generate file names
    fileNames = [f"posts_data_{unique_list[i]}.csv" for i in range(len(unique_list))]

    # Step 3: Get final data (merges the per-keyword CSVs into fileNames[0])
    try:
        getFinalData(user_query=user_query, filesNames=fileNames)
        successful_steps.append(('getFinalData',))  # Mark this step as successful
    except Exception as e:
        print(f"Failed at getFinalData: {e}")

    # Step 4: Get post comments
    try:
        getPostComments(fileName=fileNames[0])
        successful_steps.append(('getPostComments',))  # Mark this step as successful
    except Exception as e:
        print(f"Failed at getPostComments: {e}")
    # Step 5: Get sentiment of post comments
    try:
        sentiment_instance = SentimentAnalysis()
        sentiment_instance.generate_sentiment_and_emotion_from_data(fileName=fileNames[0])
        successful_steps.append(('getPostSentiment',))  # Mark this step as successful
    except Exception as e:
        print(f"Failed at getPostSentiment: {e}")
    # Optionally, return the successful steps for logging or further processing
    return {
        "fileName": fileNames[0],
        "fileUniqueId": str(unique_list[0]),
        "successful_steps": successful_steps
    }
|
venv/reddit/reddit_gemini.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
import google.generativeai as genai
|
| 5 |
+
|
| 6 |
+
from reddit.prompts import getKeywordsPrompt
|
| 7 |
+
|
| 8 |
+
def getKeywords(user_query: str):
    """Expand ``user_query`` into search keywords via Gemini.

    Returns the parsed JSON dict on success; on failure the error is
    logged and None is returned implicitly.
    """
    prompt = getKeywordsPrompt(user_query)
    model = genai.GenerativeModel("gemini-exp-1114")
    generation_config = genai.GenerationConfig(response_mime_type="application/json")

    try:
        response = model.generate_content(prompt, generation_config=generation_config)  # Adjust if the library supports async
        payload = response.text
        logging.info("Enhanced user query: %s", payload)
        return json.loads(payload)
    except Exception as e:
        logging.error("Error while fetching keywords: %s", e)
        # raise HTTPException(status_code=500, detail="Error processing request")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def upload_to_gemini(path, mime_type=None):
    """Upload a local file to the Gemini Files API and return its handle."""
    uploaded = genai.upload_file(path, mime_type=mime_type)
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded
|
| 28 |
+
|
| 29 |
+
def wait_for_files_active(files):
    """Block until every uploaded file has finished server-side processing.

    Some files uploaded to the Gemini API need to be processed before they
    can be used as prompt inputs; the status is exposed via the file's
    "state" field. This is a simple blocking polling loop — production code
    should probably employ a more sophisticated approach.

    Raises:
        Exception: if any file ends in a state other than "ACTIVE".
    """
    print("Waiting for file processing...")
    pending_names = [item.name for item in files]
    for file_name in pending_names:
        current = genai.get_file(file_name)
        # Poll every 10s while the server is still processing.
        while current.state.name == "PROCESSING":
            print(".", end="", flush=True)
            time.sleep(10)
            current = genai.get_file(file_name)
        print("file.state.name", current.state.name)
        if current.state.name != "ACTIVE":
            raise Exception(f"File {current.name} failed to process")
    print("...all files ready")
    print()
| 51 |
+
def getModelAndGenerationConfigCommon(fileName, isFlash=True):
    """Build a JSON-mode Gemini model and upload ``fileName`` as a CSV.

    ``isFlash`` selects gemini-1.5-flash (True) or gemini-1.5-pro-002.
    Returns ``[model, uploaded_file]`` once the upload is ACTIVE.
    """
    config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "application/json",
    }

    chosen_name = "gemini-1.5-flash" if isFlash else "gemini-1.5-pro-002"
    model = genai.GenerativeModel(
        model_name=chosen_name,
        generation_config=config,
    )
    uploaded = upload_to_gemini(fileName, mime_type="text/csv")

    # Some files have a processing delay. Wait for them to be ready.
    wait_for_files_active([uploaded])
    return [model, uploaded]
|
venv/reddit/reddit_pain_point_analysis.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import google.generativeai as genai
|
| 3 |
+
|
| 4 |
+
from reddit.prompts import getPainPointAnalysisPrompt
|
| 5 |
+
from reddit.reddit_gemini import upload_to_gemini, wait_for_files_active
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def pain_point_analysis(user_query, fileName, uniqueFileId):
    """Run the pain-point analysis prompt against an uploaded CSV.

    Writes the parsed JSON to ``pain_point_analysis_<uniqueFileId>.json``
    on success. Returns ``[json_data, chat_session]`` where json_data is
    ``{"details": "something went wrong"}`` if both attempts fail.
    """
    pain_point_prompt = getPainPointAnalysisPrompt(user_query=user_query)
    generation_config = genai.GenerationConfig(response_mime_type="application/json")  # Request JSON response
    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro-002",
        generation_config=generation_config,
    )

    files = [
        upload_to_gemini(fileName, mime_type="text/csv"),
    ]
    # Some files have a processing delay. Wait for them to be ready.
    wait_for_files_active(files)
    chat_session = model.start_chat(
    )

    chat_session.history = [
        {
            "role": "user",
            "parts": [
                files[0],
                pain_point_prompt
            ]
        }
    ]
    # BUG FIX: the old retry branch re-sent the message but parsed the stale
    # `data` from the failed first attempt. The loop below refreshes `data`
    # from each new response.
    for attempt in range(2):
        try:
            response = chat_session.send_message("give your pain point analysis output json as it is.")
            data = response.text
            label = "pain point analysis output" if attempt == 0 else "retried pain point analysis output"
            print(label, data)
            json_data = json.loads(data)
            with open(f'pain_point_analysis_{uniqueFileId}.json', 'w') as json_file:
                json.dump(json_data, json_file, indent=4)
            return [json_data, chat_session]
        except Exception:
            continue
    json_data = {"details": "something went wrong"}
    return [json_data, chat_session]
|
venv/reddit/reddit_search_scrapper.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
Reddit data pre processing code only.
|
| 3 |
+
'''
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from reddit.reddit_competitor_analysis import getCompetitorNames, getCompetitorNamesFromReddit, getPostDataofCompetitor, getTop10Competitors
|
| 8 |
+
from reddit.reddit_utils import topic_sort
|
| 9 |
+
|
| 10 |
+
def preProcessPostData(filesNames):
    """Clean each scraped-posts CSV in place.

    For every file: drop duplicate titles, remove posts with zero
    comments, and rebuild a fresh 0..n-1 "index" column at the front.
    """
    for csv_path in filesNames:
        frame = pd.read_csv(csv_path)
        frame.drop_duplicates(subset=["title"], inplace=True)
        # Posts without comments carry no discussion signal downstream.
        frame = frame[frame["comment_count"] != 0]
        # Replace the stale scrape-time index with a contiguous one.
        frame.drop(columns=["index"], inplace=True)
        frame.insert(0, "index", list(range(len(frame))))
        frame.to_csv(csv_path, index=False)
|
| 22 |
+
|
| 23 |
+
def getFinalData(user_query, filesNames):
    """Merge three scraped CSVs into one ranked CSV saved over filesNames[0].

    Pre-processes each CSV, ranks the combined posts by similarity to
    ``user_query`` via topic_sort, deletes the intermediate files, and
    writes the ranked result back to filesNames[0].
    """
    preProcessPostData(filesNames=filesNames)
    # files_name=["posts_data_0.csv","posts_data_1.csv","posts_data_2.csv"]
    # BUG FIX: topic_sort's signature is (path1, query, path2, path3); the old
    # positional call (file0, file1, file2, query) put the query into path3.
    final_df = topic_sort(filesNames[0], user_query, filesNames[1], filesNames[2])
    for file_path in filesNames:
        # Check if the file exists before attempting to delete
        if os.path.exists(file_path):
            os.remove(file_path)
            print("File deleted successfully")
        else:
            print("File does not exist")
    final_df.to_csv(filesNames[0], index=False)

    print("Data saved to ", filesNames[0])
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def getCompetitorAnalysisData(user_query, fileName, isSolo=True, chat_session=None):
    """Produce the full competitor-analysis payload for ``user_query``.

    Combines Gemini-derived and Reddit-derived competitor lists into a
    merged top list, then scrapes and analyses posts for each competitor.
    Returns a dict with "competitors_data" (per-competitor reports) and
    "all_competitor_data" (the merged list).
    """
    json_data_gemini = getCompetitorNames(user_query=user_query)
    # with open('competitors_names_gemini.json', 'w') as json_file:
    #     json.dump(json_data, json_file, indent=4)
    json_data_reddit = getCompetitorNamesFromReddit(
        user_query=user_query,
        isSolo=isSolo,
        last_chat_session=chat_session,
        fileName=fileName,
    )
    # with open('competitors_names_reddit.json', 'w') as json_file:
    #     json.dump(json_data, json_file, indent=4)
    merged = getTop10Competitors(gemini_data=json_data_gemini, reddit_data=json_data_reddit, user_query=user_query)
    competitor_df = pd.DataFrame(merged["list"])
    # print("Data saved to competitors_names.csv")
    competitors_data = getPostDataofCompetitor(user_query=user_query, fileName=competitor_df)

    return {
        "competitors_data": competitors_data,
        "all_competitor_data": merged["list"]
    }
|
| 55 |
+
|
venv/reddit/reddit_sentiment_analysis.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import ast
|
| 3 |
+
from transformers import BertTokenizer, BertForSequenceClassification
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
output=pd.DataFrame()
|
| 6 |
+
class SentimentAnalysis:
    """Annotates scraped Reddit comment trees with emotion labels.

    Loads a go_emotions multi-label classifier plus a FinBERT model and
    tokenizer (the FinBERT pair is loaded but not used by the methods
    visible here).
    """
    def __init__(self):
        # top_k=None makes the pipeline return scores for every label.
        self.classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
        self.finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
        self.tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
    def process_comment(self,comment):
        """Recursively annotate one comment dict (and its replies) with the
        top-3 emotion scores from the classifier."""
        # Truncated to 512 *characters* — NOTE(review): this is not token
        # truncation; confirm it always fits the model's input limit.
        sentence=[comment['comment'][:512]]
        model_outputs = self.classifier(sentence)
        # Prepare the comment data (keep only the 3 highest-scoring emotions)
        comment_data = {
            "comment": comment['comment'],
            'emotion':model_outputs[0][:3],
            "replies": [] # Initialize replies list
        }

        # Process replies recursively if any
        if comment['replies']:
            for reply in comment['replies']:

                reply_data = self.process_comment(reply) # Recursive call for replies
                comment_data["replies"].append(reply_data)

        return comment_data
    def generate_sentiment_and_emotion_from_data(self,fileName):
        """Read ``fileName`` (CSV whose 'comments' column holds a stringified
        comment tree), annotate every comment with emotions, and write the
        file back in place."""
        df = pd.read_csv(fileName)
        comments_data=[]
        for i in range(df.shape[0]):
            row=df.iloc[i]
            # The CSV stores the tree as a Python-literal string; parse it.
            commentary=(ast.literal_eval(row['comments']))
            commentary=commentary['comments']
            while commentary:
                comment = commentary.pop(0)
                comment_data = self.process_comment(comment)
                comments_data.append(comment_data)
            # NOTE(review): comments_data is never reset per row, so each
            # row's JSON also includes all previous rows' comments — confirm
            # this accumulation is intentional.
            json_output = {
                "comments": comments_data
            }
            subset_data = df.iloc[i].copy()

            # Modify the subset
            subset_data['comments'] = json_output

            # Assign back if needed
            df.iloc[i] = subset_data
        df.to_csv(fileName, index=False)
        print("Sentiment Data saved to",fileName)
|
| 53 |
+
|
venv/reddit/reddit_utils.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
def get_microseconds_list(length=3):
    """Return ``length`` unique ids derived from the current wall clock.

    The first id is the current epoch time in microseconds; each following
    id increments by one, so all ids from a single call are distinct.
    """
    base = int(time.time() * 1_000_000)
    return [base + offset for offset in range(length)]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def topic_sort(path1,query, path2='', path3='',isForCompetitorAnalysis=False):
    """Rank scraped posts by semantic similarity to ``query``.

    Reads one CSV (competitor-analysis mode) or merges three CSVs,
    embeds the query plus every post title with MiniLM, keeps the 30
    most similar posts, then returns the 18 most engaged of those
    (sorted by comment_count and votes_count).

    NOTE(review): the parameter order is (path1, query, path2, path3);
    verify all call sites pass the query second — positional calls of the
    form (file0, file1, file2, query) would read the query as a path.
    """
    sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    if isForCompetitorAnalysis==True:
        df=pd.read_csv(path1)
    else:
        # Merge the three per-keyword scrapes and de-duplicate by title.
        df0 = pd.read_csv(path1)
        df1 = pd.read_csv(path2)
        df2 = pd.read_csv(path3)
        df = pd.concat([df0, df1, df2],axis=0)
        df = df.drop_duplicates("title")
        df = df.reset_index(drop=True)
        df = df.drop("index", axis = 1)
    title = df["title"]
    # Embed the query together with all titles; row 0 of the similarity
    # matrix is query-vs-[query, title0, title1, ...].
    sentences = [query] + list(title)
    embeddings = sentence_model.encode(sentences)
    similarities = sentence_model.similarity(embeddings[0], embeddings)
    print(len(similarities[0]))
    # Skip element 0 (query compared with itself).
    df["similarity"] = similarities[0][1:]
    df = df.sort_values(by='similarity', ascending=False)
    df = df.reset_index(drop=True)
    df = df.head(30)
    # Of the 30 most on-topic posts, keep the 18 with most engagement.
    df = df.sort_values(by=['comment_count','votes_count'], ascending=False)
    df = df.reset_index(drop=True)
    df = df.head(18)
    return df
|
venv/reddit/scraping.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
'''
|
| 2 |
+
Only Scraping related code.
|
| 3 |
+
'''
|
| 4 |
+
from selenium import webdriver
|
| 5 |
+
from selenium.webdriver.common.action_chains import ActionChains
|
| 6 |
+
from selenium.webdriver.common.by import By
|
| 7 |
+
from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
| 8 |
+
import time
|
| 9 |
+
from fake_headers import Headers
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import praw
|
| 12 |
+
import re
|
| 13 |
+
|
| 14 |
+
from reddit.reddit_utils import topic_sort
|
| 15 |
+
# # Set up WebDriver
# Randomised User-Agent to make the headless browser look like a real one.
header = Headers().generate()["User-Agent"]
proxy=None
browser_option = FirefoxOptions()
browser_option.add_argument("--no-sandbox")
browser_option.add_argument("--disable-dev-shm-usage")
browser_option.add_argument("--ignore-certificate-errors")
browser_option.add_argument("--disable-gpu")
browser_option.add_argument("--log-level=3")
browser_option.add_argument("--disable-notifications")
browser_option.add_argument("--disable-popup-blocking")
browser_option.add_argument("--user-agent={}".format(header))
if proxy is not None:
    browser_option.add_argument("--proxy-server=%s" % proxy)

# For Hiding Browser
browser_option.add_argument("--headless")

# Module-level singletons: one browser and one Reddit client shared by
# every function in this module.
driver = webdriver.Firefox(options=browser_option)
actions = ActionChains(driver)
# SECURITY(review): the Reddit API client secret and the account password
# are hard-coded and committed to the repository. These credentials should
# be rotated immediately and loaded from environment variables / a secrets
# manager instead of source code.
reddit = praw.Reddit(
    client_id="yjGfys3QZPpdCpNZl25Kig",
    client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
    password="&honeyB90",
    user_agent="Curious",
    username="Final-Difference7055",
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def extract_post_id(url):
    """Pull the Reddit post id out of a permalink URL.

    Returns the path segment following "/comments/" (a trailing slash
    after the id is required by the pattern), or None when absent.
    """
    found = re.search(r'/comments/([^/]+)/', url)
    return found.group(1) if found else None
|
| 50 |
+
# 1. Get Search Post Data
|
| 51 |
+
def getSearchPostData( search_keyword,index, name="",forCompetitorAnalysis=False):
    """Scrape Reddit search results for *search_keyword* and save them to CSV.

    Args:
        search_keyword: Query string typed into Reddit's search page.
        index: Suffix used for the output file name (posts_data_{index}.csv).
        name: Optional competitor name; when non-empty, results are
            de-duplicated and filtered to titles containing it literally.
        forCompetitorAnalysis: When True, scrolls one extra page and may
            trigger topic-based re-ranking of the saved file.

    Returns:
        *index* when a large competitor dataset was saved and re-ranked;
        otherwise None.
    """
    # Navigate to the search results page
    url = f'https://www.reddit.com/search/?q={search_keyword}'
    driver.get(url)
    time.sleep(3)
    print("reached this step")

    # Rows are assembled in three positional passes over parallel element
    # lists: counters first, then titles/URLs, then timestamps.
    posts_data = []
    list_length=0 # posts count
    try:
        # Scroll down and wait for lazily loaded results.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        if forCompetitorAnalysis:
            # Competitor analysis needs more rows, so load one more page.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
        # Find post cards
        post_cards = driver.find_elements(By.CSS_SELECTOR, 'a[data-testid="post-title-text"]')
        post_cards_1 = driver.find_elements(By.CSS_SELECTOR, 'div[data-testid="search-counter-row"]')
        post_cards_2 = driver.find_elements(By.CSS_SELECTOR, 'faceplate-timeago')
        idx=list_length
        # Pass 1: vote and comment counters.
        for card in post_cards_1:
            try:
                votes_count = card.find_element(By.XPATH, './/faceplate-number').text
                # Extract the comments count, checking for both "comment" and "comments"
                comments_count = card.find_element(By.XPATH,
                    './/span[contains(text(), "comment") or contains(text(), "comments")]/preceding-sibling::faceplate-number'
                ).text
                posts_data.append({
                    "index":idx,
                    "comment_count":comments_count,
                    "votes_count":votes_count
                })
                idx+=1
            except Exception as e:
                print("Error in post_card_1:", e)

        # Pass 2: titles and URLs, matched positionally to pass 1.
        idx=list_length
        for card in post_cards:
            try:
                url=card.get_attribute("href")
                title=card.text
                posts_data[idx]["title"]=title
                posts_data[idx]["url"]=url
                idx+=1
            except Exception as e:
                print("Error in post_cards:", e)

        # Pass 3: post timestamps.
        idx=list_length
        for card in post_cards_2:
            try:
                time_element = card.find_element(By.XPATH,'./time')
                post_time=time_element.get_attribute('datetime')
                posts_data[idx]["time"]=post_time
                idx+=1
            except Exception as e:
                print("Error in post_cards_2:", e)
    except Exception as e:
        print("Error in scrolling:", e)

    df = pd.DataFrame(posts_data)
    if name!="":
        if df.empty:
            # Nothing scraped: avoid a KeyError on the missing "title" column.
            print("No posts scraped for", search_keyword)
            return None
        df = df.drop_duplicates("title")
        # regex=False: competitor names may contain regex metacharacters;
        # na=False: rows whose title failed to scrape must not poison the mask.
        df = df[df["title"].str.contains(name, regex=False, na=False)]
        if len(df) >=6:
            df.to_csv(f'posts_data_{index}.csv', index=False)
            if len(df)>18:
                # Enough data to be worth topic-ranking immediately.
                getFinalDataOfCompetitor(user_query=search_keyword,fileName=f'posts_data_{index}.csv')
            print(f"Data saved to posts_data_{index}.csv")
            return index
    else:
        df.to_csv(f'posts_data_{index}.csv', index=False)
        print(f"Data saved to posts_data_{index}.csv")
|
| 140 |
+
def getFinalDataOfCompetitor(user_query,fileName):
    """Re-rank the competitor posts stored in *fileName* by topical relevance
    to *user_query* and overwrite the same CSV file."""
    ranked = topic_sort(path1=fileName, isForCompetitorAnalysis=True, query=user_query)
    ranked.to_csv(fileName, index=False)
    print("getFinalDataOfCompetitor Data saved to ", fileName)
|
| 144 |
+
def getSearchPostDescription(url):
    """Fetch the body text of the Reddit post at *url*.

    Returns the first paragraph of the post's rich-text body, or "" when the
    post has no text body (link/image posts) or the element cannot be found.
    """
    # Renamed from `id` to avoid shadowing the builtin.
    post_id = extract_post_id(url)
    # Navigate to the post page
    driver.get(url)
    time.sleep(0.5)

    description = ""
    try:
        post_data = driver.find_element(By.CSS_SELECTOR, f'div[id="t3_{post_id}-post-rtjson-content"]')
        description = post_data.find_element(By.XPATH, './p').text
    except Exception:
        # Was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit;
        # narrowed so the scrape can still be interrupted cleanly.
        description = ""
    return description
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def process_comment(comment,reply_limit):
    """Convert a PRAW comment into a plain dict, recursing into its replies.

    *reply_limit* bounds the recursion depth: once it reaches 0, no further
    replies are descended into. Deleted authors are recorded as "Unknown".
    """
    author = comment.author
    node = {
        "user": author.name if author else "Unknown",
        "comment": comment.body,
        "replies": [],
    }

    # The limit never changes within one level, so a single pre-check is
    # equivalent to the original per-reply check.
    if comment.replies and reply_limit != 0:
        node["replies"] = [
            process_comment(child, reply_limit=reply_limit - 1)
            for child in comment.replies
        ]

    return node
|
| 179 |
+
|
| 180 |
+
# 3. get post comments data
|
| 181 |
+
def getPostComments(fileName,isForCompetitorAnalysis=False):
    """Enrich the scraped-posts CSV at *fileName* in place.

    For each row's post URL, fetch a bounded number of top-level comments
    (with limited reply depth) through the PRAW API and store them as a dict
    in a new "comments" column; then scrape every post's body text into a
    "descriptions" column via Selenium and overwrite the same CSV file.
    Rows whose comments could not be fetched are dropped.
    """
    data= pd.DataFrame(pd.read_csv(fileName))
    data["comments"]=""
    for i in range(len(data)):
        # comment_data_sub=[]
        try:
            submission = reddit.submission(url=data.iloc[i]['url'])

            # Fetch and process comments; expanding up to 2 "MoreComments"
            # stubs keeps API calls bounded.
            submission.comments.replace_more(limit=2) # Use limit=0 to get all comments
        except Exception as e:
            # Deleted/private posts or API errors: skip the row, leaving its
            # "comments" cell empty so it gets filtered out below.
            print("skipping due to error",data.iloc[i]['url'], e)
            continue
        comments_data = []

        # Seed the queue with top-level comments only; replies are handled
        # recursively inside process_comment.
        comment_queue = list(submission.comments)
        comment_count=0
        # Competitor analysis needs fewer comments per post than sentiment work.
        threshold=20 if isForCompetitorAnalysis else 40
        while comment_queue:
            if(comment_count>=threshold):
                break
            comment = comment_queue.pop(0)
            comment_data = process_comment(comment,reply_limit=2 if isForCompetitorAnalysis else 3) # Process each comment
            comments_data.append(comment_data)
            comment_count+=1
        # Now, structure the data into the desired JSON format
        json_output = {
            "comments": comments_data
        }
        subset_data = data.iloc[i].copy()

        # Modify the subset
        subset_data['comments'] = json_output

        # Assign back if needed
        data.iloc[i] = subset_data
    # Remove rows where 'comments' is an empty string (rows skipped above)
    data = data[data['comments'] != ""]
    # One Selenium page load per remaining row — this is the slow part.
    data["descriptions"] = data["url"].apply(getSearchPostDescription)
    data.to_csv(fileName, index=False)
    print("Data saved to",fileName)
|
venv/test.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# # import pandas as pd
|
| 2 |
+
# # import json
|
| 3 |
+
# # import ast
|
| 4 |
+
# # # Load the CSV file
|
| 5 |
+
# # df = pd.read_csv('file_with_sentiment.csv')
|
| 6 |
+
|
| 7 |
+
# # # Convert the 'comments' column to a list
|
| 8 |
+
# # comments_list = []
|
| 9 |
+
# # for i in df['descriptions']:
|
| 10 |
+
# # # json_data= ast.literal_eval(i)
|
| 11 |
+
# # comments_list.append(i)
|
| 12 |
+
|
| 13 |
+
# # # print("comments_list",len())
|
| 14 |
+
# # # Recursive function to count non-empty replies
|
| 15 |
+
# # def count_non_empty_replies(comments):
|
| 16 |
+
# # count = 1
|
| 17 |
+
# # for comment in comments:
|
| 18 |
+
# # if comment.get("replies"):
|
| 19 |
+
# # count += 1 # Increment for this non-empty replies list
|
| 20 |
+
# # count += count_non_empty_replies(comment["replies"]) # Recursively count nested replies
|
| 21 |
+
# # return count
|
| 22 |
+
|
| 23 |
+
# # # Example usage
|
| 24 |
+
# # # total_non_empty_replies = count_non_empty_replies(comments_list[0]['comments'][0]['replies'])
|
| 25 |
+
# # # print("Total non-empty replies:", total_non_empty_replies)
|
| 26 |
+
|
| 27 |
+
# # # Sample data structure
|
| 28 |
+
# # # comments_list = [
|
| 29 |
+
# # # {
|
| 30 |
+
# # # "comments": [
|
| 31 |
+
# # # # Each post contains a list of comments with nested replies
|
| 32 |
+
# # # ]
|
| 33 |
+
# # # }
|
| 34 |
+
# # # # More comments_list here
|
| 35 |
+
# # # ]
|
| 36 |
+
|
| 37 |
+
# # # Recursive function to limit replies in a comment tree
|
| 38 |
+
# # def limit_replies(comment, reply_limit=3):
|
| 39 |
+
# # limited_comment = {
|
| 40 |
+
# # "user": comment["user"],
|
| 41 |
+
# # "comment": comment["comment"],
|
| 42 |
+
# # "replies": []
|
| 43 |
+
# # }
|
| 44 |
+
# # if reply_limit == 0:
|
| 45 |
+
# # return limited_comment
|
| 46 |
+
|
| 47 |
+
# # # Get up to reply_limit replies, recursively applying the function
|
| 48 |
+
# # if "replies" in comment:
|
| 49 |
+
# # for reply in comment["replies"][:reply_limit-1]:
|
| 50 |
+
# # limited_comment["replies"].append(limit_replies(reply, reply_limit-2))
|
| 51 |
+
|
| 52 |
+
# # return limited_comment
|
| 53 |
+
|
| 54 |
+
# # # Function to process each post, extracting 10 comments with limited replies
|
| 55 |
+
# # def get_limited_comments(comments_list, comment_limit=10, reply_limit=7):
|
| 56 |
+
# # limited_comments_list = []
|
| 57 |
+
|
| 58 |
+
# # for post in comments_list:
|
| 59 |
+
# # limited_post = {"comments": []}
|
| 60 |
+
|
| 61 |
+
# # # Get up to comment_limit comments for each post
|
| 62 |
+
# # for comment in post["comments"][:comment_limit]:
|
| 63 |
+
# # limited_comment = limit_replies(comment, reply_limit)
|
| 64 |
+
# # limited_post["comments"].append(limited_comment)
|
| 65 |
+
|
| 66 |
+
# # limited_comments_list.append(limited_post)
|
| 67 |
+
|
| 68 |
+
# # return limited_comments_list
|
| 69 |
+
|
| 70 |
+
# # Example usage
|
| 71 |
+
# # limited_comments_data = get_limited_comments(comments_list)
|
| 72 |
+
# # total_non_empty_replies = count_non_empty_replies(limited_comments_data[0]['comments'][0]['replies'])
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# # Save the list to a JSON file
|
| 76 |
+
# # with open('comments2.json', 'w') as json_file:
|
| 77 |
+
# # json.dump(comments_list, json_file,indent=4)
|
| 78 |
+
|
| 79 |
+
# # # from reddit.scraping import getPostComments
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# # # getPostComments(fileName="posts_data_1732105228633815.csv")
|
| 83 |
+
# # import time
|
| 84 |
+
|
| 85 |
+
# # from reddit.reddit_sentiment_analysis import SentimentAnalysis
|
| 86 |
+
|
| 87 |
+
# # # Create an instance of the SentimentAnalysis class
|
| 88 |
+
# # sentiment_instance = SentimentAnalysis()
|
| 89 |
+
|
| 90 |
+
# # # Record the start time
|
| 91 |
+
# # start_time = time.time()
|
| 92 |
+
|
| 93 |
+
# # # Call the method to generate sentiment and emotion
|
| 94 |
+
# # sentiment_instance.generate_sentiment_and_emotion_from_data(fileName='posts_data.csv')
|
| 95 |
+
|
| 96 |
+
# # # Record the end time
|
| 97 |
+
# # end_time = time.time()
|
| 98 |
+
|
| 99 |
+
# # # Calculate and print the processing time
|
| 100 |
+
# # process_time = end_time - start_time
|
| 101 |
+
# # print(f"Processing time: {process_time:.2f} seconds")
|
| 102 |
+
# # from reddit.reddit_pain_point_analysis import pain_point_analysis
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# # pain_point_analysis(user_query="artificial intelligence applications in skincare and cosmetic industry",fileName="file_with_sentiment.csv")
|
| 106 |
+
|
| 107 |
+
# # import google.generativeai as genai
|
| 108 |
+
# # genai.configure(api_key='AIzaSyBtHE4Bg2ERWsKeGLxGPOSmtZeWRD6nNr0')
|
| 109 |
+
# # model = genai.GenerativeModel("gemini-1.5-flash")
|
| 110 |
+
|
| 111 |
+
# # generation_config = genai.GenerationConfig(response_mime_type="application/json")
|
| 112 |
+
# # response = model.generate_content("skin care ai ", generation_config=generation_config) # Adjust if the library supports async
|
| 113 |
+
# # data = response.text
|
| 114 |
+
# # print(data)
|
| 115 |
+
|
| 116 |
+
# '''
|
| 117 |
+
# Only Scraping related code.
|
| 118 |
+
# '''
|
| 119 |
+
# from selenium import webdriver
|
| 120 |
+
# from selenium.webdriver.common.action_chains import ActionChains
|
| 121 |
+
# from selenium.webdriver.common.by import By
|
| 122 |
+
# from selenium.webdriver.firefox.options import Options as FirefoxOptions
|
| 123 |
+
# import time
|
| 124 |
+
# from fake_headers import Headers
|
| 125 |
+
# import pandas as pd
|
| 126 |
+
# import praw
|
| 127 |
+
# # from reddit_call import sentence_model
|
| 128 |
+
# import re
|
| 129 |
+
# # # Set up WebDriver
|
| 130 |
+
# header = Headers().generate()["User-Agent"]
|
| 131 |
+
# proxy=None
|
| 132 |
+
# browser_option = FirefoxOptions()
|
| 133 |
+
# browser_option.add_argument("--no-sandbox")
|
| 134 |
+
# browser_option.add_argument("--disable-dev-shm-usage")
|
| 135 |
+
# browser_option.add_argument("--ignore-certificate-errors")
|
| 136 |
+
# browser_option.add_argument("--disable-gpu")
|
| 137 |
+
# browser_option.add_argument("--log-level=3")
|
| 138 |
+
# browser_option.add_argument("--disable-notifications")
|
| 139 |
+
# browser_option.add_argument("--disable-popup-blocking")
|
| 140 |
+
# browser_option.add_argument("--user-agent={}".format(header))
|
| 141 |
+
# if proxy is not None:
|
| 142 |
+
# browser_option.add_argument("--proxy-server=%s" % proxy)
|
| 143 |
+
|
| 144 |
+
# # For Hiding Browser
|
| 145 |
+
# browser_option.add_argument("--headless")
|
| 146 |
+
|
| 147 |
+
# driver = webdriver.Firefox(options=browser_option)
|
| 148 |
+
# actions = ActionChains(driver)
|
| 149 |
+
# reddit = praw.Reddit(
|
| 150 |
+
# client_id="yjGfys3QZPpdCpNZl25Kig",
|
| 151 |
+
# client_secret="dqoc8LrQBQhB_IgjV-lKyrD9lBPftg",
|
| 152 |
+
# password="&honeyB90",
|
| 153 |
+
# user_agent="Curious",
|
| 154 |
+
# username="Final-Difference7055",
|
| 155 |
+
# )
|
| 156 |
+
# fileName="posts_data_1732244765294548.csv"
|
| 157 |
+
# data= pd.DataFrame(pd.read_csv(fileName))
|
| 158 |
+
# data["comments"]=""
|
| 159 |
+
# for i in range(len(data)):
|
| 160 |
+
# # comment_data_sub=[]
|
| 161 |
+
# submission = reddit.submission(url=data.iloc[i]['url'])
|
| 162 |
+
|
| 163 |
+
# # Fetch and process comments
|
| 164 |
+
# submission.comments.replace_more(limit=2) # Use limit=0 to get all comments
|
| 165 |
+
# comments_data = []
|
| 166 |
+
|
| 167 |
+
# # Function to process a comment and its replies
|
| 168 |
+
# # Seed with top-level comments
|
| 169 |
+
# comment_queue = list(submission.comments)
|
| 170 |
+
# comment_count=0
|
| 171 |
+
# threshold=20
|
| 172 |
+
# while comment_queue:
|
| 173 |
+
# if(comment_count>=threshold):
|
| 174 |
+
# break
|
| 175 |
+
# comment = comment_queue.pop(0)
|
| 176 |
+
# comment_data = process_comment(comment,reply_limit=2 if isForCompetitorAnalysis else 3) # Process each comment
|
| 177 |
+
# comments_data.append(comment_data)
|
| 178 |
+
# comment_count+=1
|
| 179 |
+
# # Now, structure the data into the desired JSON format
|
| 180 |
+
# json_output = {
|
| 181 |
+
# "comments": comments_data
|
| 182 |
+
# }
|
| 183 |
+
# subset_data = data.iloc[i].copy()
|
| 184 |
+
|
| 185 |
+
# # Modify the subset
|
| 186 |
+
# subset_data['comments'] = json_output
|
| 187 |
+
|
| 188 |
+
# # Assign back if needed
|
| 189 |
+
# data.iloc[i] = subset_data
|
| 190 |
+
# # Remove rows where 'comments' is an empty string
|
| 191 |
+
# data = data[data['comments'] != ""]
|
| 192 |
+
# data["descriptions"] = data["url"].apply(getSearchPostDescription)
|
| 193 |
+
# data.to_csv(fileName, index=False)
|
| 194 |
+
# print("Data saved to",fileName)
|
| 195 |
+
|
| 196 |
+
from reddit.reddit_competitor_analysis import getCompetitorNamesFromReddit
from reddit.api_keys import api_key,api_key2, api_key3
import google.generativeai as genai

# Guard the entry point: previously this configured the Gemini client and
# kicked off a full network-heavy scrape at import time as a module-level
# side effect. The guard keeps `import test` safe while `python test.py`
# behaves exactly as before.
if __name__ == "__main__":
    genai.configure(api_key=api_key3)
    getCompetitorNamesFromReddit(user_query='AI powered personalized skin care recommendations',isSolo=True,fileName='posts_data_1732244547776200.csv',last_chat_session=None)
|
venv/utils.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
from functools import wraps
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
def time_execution(func):
    """Decorator that prints how long *func* took, supporting both plain
    callables and coroutine functions."""
    if asyncio.iscoroutinefunction(func):
        @wraps(func)
        async def timed_async(*args, **kwargs):
            started = time.time()
            outcome = await func(*args, **kwargs)
            elapsed = time.time() - started
            print(f"Function '{func.__name__}' executed in {elapsed:.4f} seconds")
            return outcome
        return timed_async

    @wraps(func)
    def timed_sync(*args, **kwargs):
        started = time.time()
        outcome = func(*args, **kwargs)
        elapsed = time.time() - started
        print(f"Function '{func.__name__}' executed in {elapsed:.4f} seconds")
        return outcome
    return timed_sync
|
| 26 |
+
|