omunaman commited on
Commit
06f2cdc
·
verified ·
1 Parent(s): 1f21286

Upload 5 Files

Browse files
Files changed (5) hide show
  1. .env +6 -0
  2. app.py +150 -0
  3. gemini_processor.py +199 -0
  4. reddit_scraper.py +119 -0
  5. requirements.txt +7 -0
.env ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# SECURITY: this file previously contained live credentials committed to the
# repository — treat them as compromised. Rotate the Reddit app secret and the
# Gemini API key in their consoles, keep .env out of version control
# (add it to .gitignore), and fill in fresh values below.
REDDIT_CLIENT_ID=<your-reddit-client-id>
REDDIT_CLIENT_SECRET=<your-reddit-client-secret>
REDDIT_USER_AGENT=reddit-scraper
GEMINI_API_KEY=<your-gemini-api-key>
GEMINI_API_ENDPOINT=https://api.gemini.example.com/v1/process # Replace with actual endpoint
SECRET_KEY=your_flask_secret_key # Replace with a strong secret key
app.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ from flask import Flask, render_template, request, Response, redirect, url_for, flash, jsonify
4
+ import os
5
+ import tempfile
6
+ from reddit_scraper import scrape_reddit_user
7
+ from gemini_processor import process_content
8
+ from dotenv import load_dotenv
9
+ import threading
10
+ import uuid
11
+
12
# Load environment variables from .env (Reddit/Gemini credentials, SECRET_KEY)
load_dotenv()

app = Flask(__name__)
# SECURITY NOTE(review): the hard-coded fallback makes session cookies
# forgeable whenever SECRET_KEY is unset — require a real value in production.
app.secret_key = os.getenv("SECRET_KEY") or 'default_secret_key' # Replace with a strong secret key

# Global dictionary to track tasks.
# In-process store mapping task_id -> progress/status dict; mutated from both
# request handlers and worker threads, and lost on process restart.
tasks = {}
20
+
21
def background_task(username, task_id):
    """Run the full scrape-then-analyze pipeline for one request.

    Executed on a worker thread; communicates with the web layer solely by
    mutating the shared ``tasks`` entry for ``task_id`` (progress text,
    status, and the final report path).
    """
    try:
        task = tasks[task_id]

        task['progress'] = 'Scraping Reddit data...'
        scraped_data = scrape_reddit_user(username, task_id, tasks)
        if not scraped_data:
            task['progress'] = 'Failed to scrape Reddit data.'
            task['status'] = 'Failed'
            return

        task['progress'] = 'Processing data through Gemini API...'
        report_path = process_content(username, scraped_data, task_id, tasks)
        if not report_path or not os.path.exists(report_path):
            task['progress'] = 'Failed to process data with Gemini API.'
            task['status'] = 'Failed'
            return

        task['progress'] = 'Report generated successfully.'
        task['status'] = 'Completed'
        task['report_path'] = report_path

    except Exception as exc:
        print(f"Error in background task: {exc}")
        tasks[task_id]['progress'] = 'An unexpected error occurred.'
        tasks[task_id]['status'] = 'Failed'
49
+
50
def get_unique_task_id():
    """Return a random 32-character lowercase-hex string to key ``tasks``."""
    return str(uuid.uuid4()).replace('-', '')
52
+
53
@app.route('/', methods=['GET', 'POST'])
def index():
    """Landing page: render the form, or start a scrape job on POST."""
    if request.method != 'POST':
        return render_template('index.html')

    reddit_username = request.form.get('reddit_username', '').strip()
    if not reddit_username:
        flash('Please enter a Reddit username.', 'danger')
        return redirect(url_for('index'))

    # Register the job before the worker starts so /status can see it
    # immediately after the redirect.
    task_id = get_unique_task_id()
    tasks[task_id] = {
        'progress': 'Task started.',
        'status': 'In Progress',
        'report_path': None,
        'total_posts': 0,
        'scraped_posts': 0,
        'total_comments': 0,
        'scraped_comments': 0,
    }

    # Hand the long-running work to a background thread so this request
    # can return right away.
    worker = threading.Thread(target=background_task, args=(reddit_username, task_id))
    worker.start()

    flash('Your request is being processed. Please wait...', 'info')
    return redirect(url_for('progress_page', task_id=task_id))
81
+
82
@app.route('/progress/<task_id>', methods=['GET'])
def progress_page(task_id):
    """Serve the progress-bar page for a known task; bounce home otherwise."""
    if task_id in tasks:
        return render_template('progress.html', task_id=task_id)
    flash('Invalid task ID.', 'danger')
    return redirect(url_for('index'))
91
+
92
@app.route('/status/<task_id>', methods=['GET'])
def status(task_id):
    """JSON endpoint polled by the progress page for live task state."""
    task = tasks.get(task_id)
    if task is None:
        return jsonify({'status': 'Invalid task ID.'}), 404

    payload = {
        'status': task.get('status', 'Unknown'),
        'progress': task.get('progress', ''),
    }
    # Counters default to 0 until the scraper fills them in.
    for counter in ('total_posts', 'scraped_posts', 'total_comments', 'scraped_comments'):
        payload[counter] = task.get(counter, 0)
    return jsonify(payload)
114
+
115
@app.route('/download/<task_id>', methods=['GET'])
def download(task_id):
    """Stream the finished Markdown report to the client.

    The report file is deleted and the task entry dropped once streaming
    ends, so each report can be downloaded exactly once.
    """
    if task_id not in tasks:
        flash('Invalid task ID.', 'danger')
        return redirect(url_for('index'))
    if tasks[task_id]['status'] != 'Completed':
        flash('Report is not ready yet.', 'warning')
        return redirect(url_for('progress_page', task_id=task_id))

    report_path = tasks[task_id]['report_path']
    if not report_path or not os.path.exists(report_path):
        flash('Report file not found.', 'danger')
        return redirect(url_for('index'))

    # Generator that streams the file in chunks. BUG FIX: cleanup used to run
    # only after a fully consumed stream, so a client disconnect leaked both
    # the report file and the task entry; the finally block now guarantees
    # cleanup on every exit path (including GeneratorExit on disconnect).
    def generate():
        try:
            with open(report_path, 'rb') as f:
                while True:
                    chunk = f.read(4096)
                    if not chunk:
                        break
                    yield chunk
        finally:
            try:
                os.remove(report_path)
            except OSError:
                pass  # best-effort: file may already be gone
            tasks.pop(task_id, None)

    return Response(generate(), mimetype='text/markdown', headers={
        'Content-Disposition': f'attachment; filename="{os.path.basename(report_path)}"'
    })
148
+
149
if __name__ == '__main__':
    # Local development entry point only: debug=True enables the Werkzeug
    # interactive debugger (arbitrary code execution if ever exposed) and the
    # auto-reloader. In production, serve `app` via gunicorn instead (it is
    # already listed in requirements.txt).
    app.run(debug=True)
gemini_processor.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gemini_processor.py
2
+
3
+ import os
4
+ import time
5
+ import google.generativeai as genai
6
+ from dotenv import load_dotenv
7
+ import uuid
8
+ import tempfile
9
+
10
# Load environment variables from .env so GEMINI_API_KEY is available.
load_dotenv()

# Configure Gemini API client once at import time.
# NOTE(review): os.getenv returns None when GEMINI_API_KEY is unset, so a
# missing key only surfaces later as an API error — confirm it is configured.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
+
16
def upload_to_gemini(path, mime_type=None):
    """Upload ``path`` to the Gemini Files API.

    Returns the uploaded file handle on success, or None on any failure
    (errors are printed rather than raised).
    """
    try:
        uploaded = genai.upload_file(path, mime_type=mime_type)
        print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
        return uploaded
    except Exception as e:
        print(f"Error uploading file to Gemini API: {e}")
        return None
27
+
28
def wait_for_files_active(files):
    """Block until every uploaded file leaves the PROCESSING state.

    Polls the Gemini Files API every 10 seconds per file and raises if any
    file ends in a state other than ACTIVE.
    """
    print("Waiting for file processing...")
    for uploaded in files:
        current = genai.get_file(uploaded.name)
        # Poll until Gemini finishes ingesting this file.
        while current.state.name == "PROCESSING":
            print(".", end="", flush=True)
            time.sleep(10)
            current = genai.get_file(uploaded.name)
        if current.state.name != "ACTIVE":
            raise Exception(f"File {current.name} failed to process")
    print("...all files ready\n")
42
+
43
def process_content(username, content, task_id, tasks):
    """Run the scraped content through the Gemini API and save the report.

    Writes ``content`` to a unique temp .md file, uploads it to Gemini,
    requests the analysis report, and saves the model response as Markdown.
    Progress/status updates are written into ``tasks[task_id]``.

    Returns the path of the generated report, or None on any failure (the
    task is marked 'Failed' in that case).
    """
    temp_input_file = None
    try:
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        tasks[task_id]['progress'] = 'Initializing Gemini model...'
        model = genai.GenerativeModel(
            model_name="gemini-exp-1206",  # Replace with actual model name if different
            generation_config=generation_config,
        )

        # Create a unique temporary input file so concurrent tasks never collide.
        temp_dir = tempfile.gettempdir()
        unique_id = uuid.uuid4().hex
        temp_input_file = os.path.join(temp_dir, f"{username}_{unique_id}_reddit_full_data.md")
        with open(temp_input_file, "w", encoding="utf-8") as f:
            f.write(content)

        tasks[task_id]['progress'] = 'Uploading file to Gemini API...'
        uploaded_file = upload_to_gemini(temp_input_file, mime_type="text/markdown")
        if not uploaded_file:
            tasks[task_id]['status'] = 'Failed'
            tasks[task_id]['progress'] = 'Failed to upload file to Gemini API.'
            return None

        tasks[task_id]['progress'] = 'Waiting for Gemini to process the file...'
        wait_for_files_active([uploaded_file])

        tasks[task_id]['progress'] = 'Generating analysis report...'
        # NOTE(review): this prompt explicitly asks the model to extract
        # real-life personal details from the scraped data. That raises
        # serious privacy / platform-ToS concerns and should be reviewed
        # before any deployment.
        chat_session = model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        uploaded_file,
                        """You are an advanced AI linguist, psychologist, and behavior analyst trained to analyze digital personas. The attached file contains publicly scraped data of a Reddit account, including their posts and comments. Your task is to create a highly detailed and objective report analyzing the personality, behavior, and potential real-life characteristics of the individual behind this account. Be thorough, no sugarcoating, and support every conclusion with evidence from their posts or comments. You have to be in detail as much as possible breakdown everything. The analysis should be structured as follows:

### 1. **General Overview**
- Summarize their overall Reddit activity.
- Identify the primary subreddits they engage with and their interaction patterns.
- Highlight any notable quirks or unique behaviors.

### 2. **Personality Traits**
- Writing Style:
- Do they use a lot of slang, swear words, or formal language?
- Are they concise or verbose? How articulate are they?
- Emotional Tone:
- Do they appear sarcastic, angry, empathetic, or neutral or what?
- Identify recurring emotional patterns (e.g., consistent frustration, humor, kindness, etc).
- Recurring Themes:
- What topics are they obsessed with (e.g., tech, politics, cats)?
- Any peculiar or niche interests that stand out?

### 3. **Behavioral Red Flags**
- Problematic Behavior:
- Are there indications of toxic traits (e.g., misogyny, racism, trolling etc)?
- Provide evidence from specific posts/comments.
- Controversial Topics:
- Have they engaged in heated debates or controversial discussions? If so, which ones?
- Ethical Concerns:
- Any signs of stalking, harassment, or unethical behavior? Cite examples.

### 4. **Psychological Insights**
- Infer potential personality disorders or quirks based on their patterns (e.g., narcissism, obsessive tendencies, etc).
- Are there signs of insecurity, overconfidence, or attention-seeking behavior or any other similar?
- Any traits that suggest leadership qualities, creativity, or empathy?

### 5. **Social Dynamics**
- Interaction Style:
- Do they seek validation? Argue a lot? Or mostly observe?
- How do they respond to criticism—defensive, open-minded, dismissive?
- Relationship Indicators:
- Can you infer how they might interact with friends, colleagues, or family based on their tone and topics?

### 6. **Real-Life Details (Deep Dive)**
- **Personal Information Extraction**:
- Extract any real-life details the user may have inadvertently shared (e.g., full name, location, city, state, country).
- Did they mention where they live or any specific places related to them (e.g., city, neighborhood)?
- **Family and Relationships**:
- If the user shared any information about their family (e.g., parents, siblings, children), include it.
- Look for any references to close relationships or social groups (e.g., friends, colleagues, romantic partners).
- Note if they referenced any personal struggles, relationships with family, or any other intimate details they’ve discussed.
- **Detailed Analysis of Real-Life Connections**
- Does the person mention any specific events or people in their personal life? (E.g., family holidays, relationships, problems with peers, etc.)
- What can be inferred about their social circles or living environment based on the information shared?

### 7. **Judgment and Prediction**
- Is this person likely a positive or negative influence in real life? Why?
- What kind of individual might they be in real-world settings (e.g., introvert, extrovert, leader, loner)?
- Predict their personality in real life with evidence-backed reasoning.

### 8. **Detailed Proofs**
- For every conclusion you make, cite specific posts, comments, or patterns from the data. Use quotes or direct references for clarity.
- Example:
- "The user exhibits signs of trolling. In [this comment](https://reddit.com/comment_id), they mocked someone’s opinion without adding value."
- "Evidence of recurring sarcasm: 'Yeah, sure, because *that’s* going to solve the world’s problems' [Post in r/sarcasm]."
- "Signs of toxic masculinity in [this post](https://reddit.com/post_id): 'Women these days just want...'"

### 9. **Report Structure**
- **Concise Headings:** Use bullet points, headers, and sub-headers for readability.
- **Language Style:** Be sharp, direct, and unapologetic, as if preparing a psychological profile for an investigation.
- **Tone:** Maintain professionalism, but don’t shy away from brutally honest insights.

### Example Outputs:
- *"Bro, you're essentially Reddit's poster child for trolling. Here’s the proof: [links to comments]. Your obsession with debating flat-earthers in r/science suggests an inferiority complex and a need to assert intellectual dominance."*
- *"Based on [this post](https://reddit.com/post_id) in r/MGTOW, your comments reveal a pattern of misogynistic tendencies and anger issues. This is consistent across multiple threads."*
- *"You’ve replied 'LOL cringe' to 37 people in r/memes. This indicates dismissive behavior and likely a lack of constructive engagement in real life."*

Finally, ensure your report is brutally honest, free of bias, and as comprehensive as possible.""",
                    ],
                },
                {
                    "role": "model",
                    "parts": [
                        "Yes, I will do it.",
                    ],
                },
            ]
        )

        try:
            response = chat_session.send_message("Yes Do IT!!!!")
        except Exception as e:
            print(f"Error during chat session: {e}")
            tasks[task_id]['progress'] = 'Failed during Gemini processing.'
            tasks[task_id]['status'] = 'Failed'
            return None

        # Save the response to a unique .md file.
        unique_id = uuid.uuid4().hex
        output_filename = f"response_output_{username}_{unique_id}.md"
        output_path = os.path.join(temp_dir, output_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(response.text)

        print(f"Response saved to {output_path}")
        tasks[task_id]['progress'] = 'Report generated successfully.'
        tasks[task_id]['status'] = 'Completed'
        tasks[task_id]['report_path'] = output_path
        return output_path

    except Exception as e:
        # BUG FIX: this was `except Exception as e: pass`, which silently
        # swallowed every error, returned None implicitly, and left the task
        # stuck without a terminal status.
        print(f"Error in process_content: {e}")
        tasks[task_id]['progress'] = 'An unexpected error occurred during Gemini processing.'
        tasks[task_id]['status'] = 'Failed'
        return None
    finally:
        # Clean up the temporary input file on every path (the original
        # leaked it on the success path).
        if temp_input_file and os.path.exists(temp_input_file):
            try:
                os.remove(temp_input_file)
            except OSError:
                pass
reddit_scraper.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # reddit_scraper.py
2
+
3
+ import praw
4
+ from prawcore.exceptions import RequestException, ServerError, ResponseException, Forbidden
5
+ import os
6
+ import time
7
+ from dotenv import load_dotenv
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
# Initialize Reddit instance
# Read-only PRAW client configured from .env, created once at import time.
# NOTE(review): if the env vars are missing these arguments come through as
# None and every later API call fails with an auth error — confirm .env is
# present before the module is imported.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT")
)
18
+
19
def wait_and_retry(func, *args, retries=5, backoff_factor=2, **kwargs):
    """Call ``func(*args, **kwargs)`` with exponential backoff on transient errors.

    Retries up to ``retries`` times on rate-limit/server/response errors,
    sleeping ``backoff_factor ** attempt`` seconds between attempts. Returns
    the function's result, or None when a 403 is hit or retries are exhausted.

    BUG FIX: in prawcore, ``Forbidden`` is a subclass of ``ResponseException``,
    so the original handler order made the dedicated ``except Forbidden``
    branch unreachable and 403s were pointlessly retried with backoff. The
    Forbidden handler must come first so forbidden resources are skipped
    immediately, as intended.
    """
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Forbidden:
            # A 403 will not resolve itself; do not burn retries on it.
            print("Access forbidden. Skipping...")
            return None
        except (RequestException, ServerError, ResponseException) as e:
            delay = backoff_factor ** attempt
            print(f"Error: {e}. Retrying in {delay} seconds...")
            time.sleep(delay)
    print(f"Failed after {retries} attempts.")
    return None
37
+
38
def scrape_reddit_user(username, task_id, tasks):
    """Scrape a Reddit user's posts and comments into one Markdown document.

    Progress counters and status are written into ``tasks[task_id]`` so the
    web layer can poll them. Returns the Markdown string, or None when the
    user cannot be fetched or an unexpected error occurs (in which case the
    task is marked 'Failed').
    """
    output_data = ""
    try:
        tasks[task_id]['progress'] = 'Fetching user information...'
        # Get user object
        user = wait_and_retry(reddit.redditor, username)
        if not user:
            print(f"Unable to fetch data for user: {username}")
            tasks[task_id]['progress'] = 'Failed to fetch user data.'
            tasks[task_id]['status'] = 'Failed'
            return None

        output_data += f"# Reddit User: {username}\n\n## 📝 Posts:\n\n"

        # Count posts and comments first so progress can be reported as x/y.
        # NOTE(review): this walks both listings twice (count, then scrape),
        # doubling the API traffic for large accounts.
        tasks[task_id]['progress'] = 'Counting total posts and comments...'
        # BUG FIX: wait_and_retry returns None on failure; coerce to 0 so the
        # counters and progress strings stay numeric.
        total_posts = wait_and_retry(lambda: sum(1 for _ in user.submissions.new(limit=None))) or 0
        total_comments = wait_and_retry(lambda: sum(1 for _ in user.comments.new(limit=None))) or 0
        tasks[task_id]['total_posts'] = total_posts
        tasks[task_id]['total_comments'] = total_comments
        tasks[task_id]['progress'] = f"Total Posts: {total_posts}, Total Comments: {total_comments}\n"

        # Initialize scraped counts
        tasks[task_id]['scraped_posts'] = 0
        tasks[task_id]['scraped_comments'] = 0

        # Scrape posts
        tasks[task_id]['progress'] = 'Scraping posts...'
        submissions = wait_and_retry(user.submissions.new, limit=None)
        if submissions:
            for post in submissions:
                try:
                    output_data += (
                        f"### Title: {post.title}\n"
                        f"**Subreddit:** {post.subreddit}\n"
                        f"**URL:** {post.url}\n"
                        f"**Content:** {post.selftext or 'No Content'}\n\n"
                    )
                    tasks[task_id]['scraped_posts'] += 1
                    tasks[task_id]['progress'] = f"Scraping posts... ({tasks[task_id]['scraped_posts']}/{tasks[task_id]['total_posts']})"
                except Exception as post_error:
                    print(f"Error with post: {post_error}")

        # Add section for comments
        output_data += "\n## 💬 Comments:\n\n"

        # Scrape comments
        tasks[task_id]['progress'] = 'Scraping comments...'
        comments = wait_and_retry(user.comments.new, limit=None)
        if comments:
            for comment in comments:
                try:
                    comment_data = (
                        f"### Comment:\n{comment.body}\n"
                        f"**Subreddit:** {comment.subreddit}\n"
                        f"**Post:** {comment.submission.title}\n"
                    )

                    # Include the parent comment for context when this is a reply.
                    if not comment.is_root:
                        parent_comment = wait_and_retry(comment.parent)
                        if isinstance(parent_comment, praw.models.Comment):
                            comment_data += f"**Parent Comment:** {parent_comment.body}\n"

                    comment_data += "\n"
                    output_data += comment_data
                    tasks[task_id]['scraped_comments'] += 1
                    tasks[task_id]['progress'] = f"Scraping comments... ({tasks[task_id]['scraped_comments']}/{tasks[task_id]['total_comments']})"
                except Exception as comment_error:
                    print(f"Error with comment: {comment_error}")

        print("\nScraping completed!")
        tasks[task_id]['progress'] = 'Scraping completed. Processing data...'
        tasks[task_id]['status'] = 'Processing'
        return output_data

    except Exception as e:
        # BUG FIX: this was a bare `except: pass`, which silently swallowed
        # every error (even KeyboardInterrupt/SystemExit), returned None
        # implicitly, and never marked the task as failed.
        print(f"Error while scraping u/{username}: {e}")
        tasks[task_id]['progress'] = 'An unexpected error occurred while scraping.'
        tasks[task_id]['status'] = 'Failed'
        return None
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Flask
2
+ google-generativeai
3
+ gunicorn
4
+ praw
5
+ python-dotenv
6
+ requests
7
+ tqdm