omunaman commited on
Commit
06f2cdc
·
verified ·
1 Parent(s): 1f21286

Upload 5 Files

Browse files
Files changed (5) hide show
  1. .env +6 -0
  2. app.py +150 -0
  3. gemini_processor.py +199 -0
  4. reddit_scraper.py +119 -0
  5. requirements.txt +7 -0
.env ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# SECURITY: this file previously contained live credentials committed to the
# repository — treat them as compromised. Rotate the Reddit app secret and the
# Gemini API key in their consoles, keep .env out of version control
# (add it to .gitignore), and fill in fresh values below.
REDDIT_CLIENT_ID=<your-reddit-client-id>
REDDIT_CLIENT_SECRET=<your-reddit-client-secret>
REDDIT_USER_AGENT=reddit-scraper
GEMINI_API_KEY=<your-gemini-api-key>
GEMINI_API_ENDPOINT=https://api.gemini.example.com/v1/process # Replace with actual endpoint
SECRET_KEY=your_flask_secret_key # Replace with a strong secret key
app.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ from flask import Flask, render_template, request, Response, redirect, url_for, flash, jsonify
4
+ import os
5
+ import tempfile
6
+ from reddit_scraper import scrape_reddit_user
7
+ from gemini_processor import process_content
8
+ from dotenv import load_dotenv
9
+ import threading
10
+ import uuid
11
+
12
# Load environment variables from .env (Reddit/Gemini credentials, SECRET_KEY)
load_dotenv()

app = Flask(__name__)
# SECURITY NOTE(review): the hard-coded fallback makes session cookies
# forgeable whenever SECRET_KEY is unset — require a real value in production.
app.secret_key = os.getenv("SECRET_KEY") or 'default_secret_key' # Replace with a strong secret key

# Global dictionary to track tasks.
# In-process store mapping task_id -> progress/status dict; mutated from both
# request handlers and worker threads, and lost on process restart.
tasks = {}
20
+
21
def background_task(username, task_id):
    """Run the full scrape-then-analyze pipeline for one request.

    Executed on a worker thread; communicates with the web layer solely by
    mutating the shared ``tasks`` entry for ``task_id`` (progress text,
    status, and the final report path).
    """
    try:
        task = tasks[task_id]

        task['progress'] = 'Scraping Reddit data...'
        scraped_data = scrape_reddit_user(username, task_id, tasks)
        if not scraped_data:
            task['progress'] = 'Failed to scrape Reddit data.'
            task['status'] = 'Failed'
            return

        task['progress'] = 'Processing data through Gemini API...'
        report_path = process_content(username, scraped_data, task_id, tasks)
        if not report_path or not os.path.exists(report_path):
            task['progress'] = 'Failed to process data with Gemini API.'
            task['status'] = 'Failed'
            return

        task['progress'] = 'Report generated successfully.'
        task['status'] = 'Completed'
        task['report_path'] = report_path

    except Exception as exc:
        print(f"Error in background task: {exc}")
        tasks[task_id]['progress'] = 'An unexpected error occurred.'
        tasks[task_id]['status'] = 'Failed'
49
+
50
def get_unique_task_id():
    """Return a random 32-character lowercase-hex string to key ``tasks``."""
    return str(uuid.uuid4()).replace('-', '')
52
+
53
@app.route('/', methods=['GET', 'POST'])
def index():
    """Landing page: render the form, or start a scrape job on POST."""
    if request.method != 'POST':
        return render_template('index.html')

    reddit_username = request.form.get('reddit_username', '').strip()
    if not reddit_username:
        flash('Please enter a Reddit username.', 'danger')
        return redirect(url_for('index'))

    # Register the job before the worker starts so /status can see it
    # immediately after the redirect.
    task_id = get_unique_task_id()
    tasks[task_id] = {
        'progress': 'Task started.',
        'status': 'In Progress',
        'report_path': None,
        'total_posts': 0,
        'scraped_posts': 0,
        'total_comments': 0,
        'scraped_comments': 0,
    }

    # Hand the long-running work to a background thread so this request
    # can return right away.
    worker = threading.Thread(target=background_task, args=(reddit_username, task_id))
    worker.start()

    flash('Your request is being processed. Please wait...', 'info')
    return redirect(url_for('progress_page', task_id=task_id))
81
+
82
@app.route('/progress/<task_id>', methods=['GET'])
def progress_page(task_id):
    """Serve the progress-bar page for a known task; bounce home otherwise."""
    if task_id in tasks:
        return render_template('progress.html', task_id=task_id)
    flash('Invalid task ID.', 'danger')
    return redirect(url_for('index'))
91
+
92
@app.route('/status/<task_id>', methods=['GET'])
def status(task_id):
    """JSON endpoint polled by the progress page for live task state."""
    task = tasks.get(task_id)
    if task is None:
        return jsonify({'status': 'Invalid task ID.'}), 404

    payload = {
        'status': task.get('status', 'Unknown'),
        'progress': task.get('progress', ''),
    }
    # Counters default to 0 until the scraper fills them in.
    for counter in ('total_posts', 'scraped_posts', 'total_comments', 'scraped_comments'):
        payload[counter] = task.get(counter, 0)
    return jsonify(payload)
114
+
115
@app.route('/download/<task_id>', methods=['GET'])
def download(task_id):
    """Stream the finished Markdown report to the client.

    The report file is deleted and the task entry dropped once streaming
    ends, so each report can be downloaded exactly once.
    """
    if task_id not in tasks:
        flash('Invalid task ID.', 'danger')
        return redirect(url_for('index'))
    if tasks[task_id]['status'] != 'Completed':
        flash('Report is not ready yet.', 'warning')
        return redirect(url_for('progress_page', task_id=task_id))

    report_path = tasks[task_id]['report_path']
    if not report_path or not os.path.exists(report_path):
        flash('Report file not found.', 'danger')
        return redirect(url_for('index'))

    # Generator that streams the file in chunks. BUG FIX: cleanup used to run
    # only after a fully consumed stream, so a client disconnect leaked both
    # the report file and the task entry; the finally block now guarantees
    # cleanup on every exit path (including GeneratorExit on disconnect).
    def generate():
        try:
            with open(report_path, 'rb') as f:
                while True:
                    chunk = f.read(4096)
                    if not chunk:
                        break
                    yield chunk
        finally:
            try:
                os.remove(report_path)
            except OSError:
                pass  # best-effort: file may already be gone
            tasks.pop(task_id, None)

    return Response(generate(), mimetype='text/markdown', headers={
        'Content-Disposition': f'attachment; filename="{os.path.basename(report_path)}"'
    })
148
+
149
if __name__ == '__main__':
    # Local development entry point only: debug=True enables the Werkzeug
    # interactive debugger (arbitrary code execution if ever exposed) and the
    # auto-reloader. In production, serve `app` via gunicorn instead (it is
    # already listed in requirements.txt).
    app.run(debug=True)
gemini_processor.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gemini_processor.py
2
+
3
+ import os
4
+ import time
5
+ import google.generativeai as genai
6
+ from dotenv import load_dotenv
7
+ import uuid
8
+ import tempfile
9
+
10
# Load environment variables from .env so GEMINI_API_KEY is available.
load_dotenv()

# Configure Gemini API client once at import time.
# NOTE(review): os.getenv returns None when GEMINI_API_KEY is unset, so a
# missing key only surfaces later as an API error — confirm it is configured.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
+
16
def upload_to_gemini(path, mime_type=None):
    """Upload ``path`` to the Gemini Files API.

    Returns the uploaded file handle on success, or None on any failure
    (errors are printed rather than raised).
    """
    try:
        uploaded = genai.upload_file(path, mime_type=mime_type)
        print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
        return uploaded
    except Exception as e:
        print(f"Error uploading file to Gemini API: {e}")
        return None
27
+
28
def wait_for_files_active(files):
    """Block until every uploaded file leaves the PROCESSING state.

    Polls the Gemini Files API every 10 seconds per file and raises if any
    file ends in a state other than ACTIVE.
    """
    print("Waiting for file processing...")
    for uploaded in files:
        current = genai.get_file(uploaded.name)
        # Poll until Gemini finishes ingesting this file.
        while current.state.name == "PROCESSING":
            print(".", end="", flush=True)
            time.sleep(10)
            current = genai.get_file(uploaded.name)
        if current.state.name != "ACTIVE":
            raise Exception(f"File {current.name} failed to process")
    print("...all files ready\n")
42
+
43
def process_content(username, content, task_id, tasks):
    """Run the scraped content through the Gemini API and save the report.

    Writes ``content`` to a unique temp .md file, uploads it to Gemini,
    requests the analysis report, and saves the model response as Markdown.
    Progress/status updates are written into ``tasks[task_id]``.

    Returns the path of the generated report, or None on any failure (the
    task is marked 'Failed' in that case).
    """
    temp_input_file = None
    try:
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        tasks[task_id]['progress'] = 'Initializing Gemini model...'
        model = genai.GenerativeModel(
            model_name="gemini-exp-1206",  # Replace with actual model name if different
            generation_config=generation_config,
        )

        # Create a unique temporary input file so concurrent tasks never collide.
        temp_dir = tempfile.gettempdir()
        unique_id = uuid.uuid4().hex
        temp_input_file = os.path.join(temp_dir, f"{username}_{unique_id}_reddit_full_data.md")
        with open(temp_input_file, "w", encoding="utf-8") as f:
            f.write(content)

        tasks[task_id]['progress'] = 'Uploading file to Gemini API...'
        uploaded_file = upload_to_gemini(temp_input_file, mime_type="text/markdown")
        if not uploaded_file:
            tasks[task_id]['status'] = 'Failed'
            tasks[task_id]['progress'] = 'Failed to upload file to Gemini API.'
            return None

        tasks[task_id]['progress'] = 'Waiting for Gemini to process the file...'
        wait_for_files_active([uploaded_file])

        tasks[task_id]['progress'] = 'Generating analysis report...'
        # NOTE(review): this prompt explicitly asks the model to extract
        # real-life personal details from the scraped data. That raises
        # serious privacy / platform-ToS concerns and should be reviewed
        # before any deployment.
        chat_session = model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        uploaded_file,
                        """You are an advanced AI linguist, psychologist, and behavior analyst trained to analyze digital personas. The attached file contains publicly scraped data of a Reddit account, including their posts and comments. Your task is to create a highly detailed and objective report analyzing the personality, behavior, and potential real-life characteristics of the individual behind this account. Be thorough, no sugarcoating, and support every conclusion with evidence from their posts or comments. You have to be in detail as much as possible breakdown everything. The analysis should be structured as follows:

### 1. **General Overview**
- Summarize their overall Reddit activity.
- Identify the primary subreddits they engage with and their interaction patterns.
- Highlight any notable quirks or unique behaviors.

### 2. **Personality Traits**
- Writing Style:
- Do they use a lot of slang, swear words, or formal language?
- Are they concise or verbose? How articulate are they?
- Emotional Tone:
- Do they appear sarcastic, angry, empathetic, or neutral or what?
- Identify recurring emotional patterns (e.g., consistent frustration, humor, kindness, etc).
- Recurring Themes:
- What topics are they obsessed with (e.g., tech, politics, cats)?
- Any peculiar or niche interests that stand out?

### 3. **Behavioral Red Flags**
- Problematic Behavior:
- Are there indications of toxic traits (e.g., misogyny, racism, trolling etc)?
- Provide evidence from specific posts/comments.
- Controversial Topics:
- Have they engaged in heated debates or controversial discussions? If so, which ones?
- Ethical Concerns:
- Any signs of stalking, harassment, or unethical behavior? Cite examples.

### 4. **Psychological Insights**
- Infer potential personality disorders or quirks based on their patterns (e.g., narcissism, obsessive tendencies, etc).
- Are there signs of insecurity, overconfidence, or attention-seeking behavior or any other similar?
- Any traits that suggest leadership qualities, creativity, or empathy?

### 5. **Social Dynamics**
- Interaction Style:
- Do they seek validation? Argue a lot? Or mostly observe?
- How do they respond to criticism—defensive, open-minded, dismissive?
- Relationship Indicators:
- Can you infer how they might interact with friends, colleagues, or family based on their tone and topics?

### 6. **Real-Life Details (Deep Dive)**
- **Personal Information Extraction**:
- Extract any real-life details the user may have inadvertently shared (e.g., full name, location, city, state, country).
- Did they mention where they live or any specific places related to them (e.g., city, neighborhood)?
- **Family and Relationships**:
- If the user shared any information about their family (e.g., parents, siblings, children), include it.
- Look for any references to close relationships or social groups (e.g., friends, colleagues, romantic partners).
- Note if they referenced any personal struggles, relationships with family, or any other intimate details they’ve discussed.
- **Detailed Analysis of Real-Life Connections**
- Does the person mention any specific events or people in their personal life? (E.g., family holidays, relationships, problems with peers, etc.)
- What can be inferred about their social circles or living environment based on the information shared?

### 7. **Judgment and Prediction**
- Is this person likely a positive or negative influence in real life? Why?
- What kind of individual might they be in real-world settings (e.g., introvert, extrovert, leader, loner)?
- Predict their personality in real life with evidence-backed reasoning.

### 8. **Detailed Proofs**
- For every conclusion you make, cite specific posts, comments, or patterns from the data. Use quotes or direct references for clarity.
- Example:
- "The user exhibits signs of trolling. In [this comment](https://reddit.com/comment_id), they mocked someone’s opinion without adding value."
- "Evidence of recurring sarcasm: 'Yeah, sure, because *that’s* going to solve the world’s problems' [Post in r/sarcasm]."
- "Signs of toxic masculinity in [this post](https://reddit.com/post_id): 'Women these days just want...'"

### 9. **Report Structure**
- **Concise Headings:** Use bullet points, headers, and sub-headers for readability.
- **Language Style:** Be sharp, direct, and unapologetic, as if preparing a psychological profile for an investigation.
- **Tone:** Maintain professionalism, but don’t shy away from brutally honest insights.

### Example Outputs:
- *"Bro, you're essentially Reddit's poster child for trolling. Here’s the proof: [links to comments]. Your obsession with debating flat-earthers in r/science suggests an inferiority complex and a need to assert intellectual dominance."*
- *"Based on [this post](https://reddit.com/post_id) in r/MGTOW, your comments reveal a pattern of misogynistic tendencies and anger issues. This is consistent across multiple threads."*
- *"You’ve replied 'LOL cringe' to 37 people in r/memes. This indicates dismissive behavior and likely a lack of constructive engagement in real life."*

Finally, ensure your report is brutally honest, free of bias, and as comprehensive as possible.""",
                    ],
                },
                {
                    "role": "model",
                    "parts": [
                        "Yes, I will do it.",
                    ],
                },
            ]
        )

        try:
            response = chat_session.send_message("Yes Do IT!!!!")
        except Exception as e:
            print(f"Error during chat session: {e}")
            tasks[task_id]['progress'] = 'Failed during Gemini processing.'
            tasks[task_id]['status'] = 'Failed'
            return None

        # Save the response to a unique .md file.
        unique_id = uuid.uuid4().hex
        output_filename = f"response_output_{username}_{unique_id}.md"
        output_path = os.path.join(temp_dir, output_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(response.text)

        print(f"Response saved to {output_path}")
        tasks[task_id]['progress'] = 'Report generated successfully.'
        tasks[task_id]['status'] = 'Completed'
        tasks[task_id]['report_path'] = output_path
        return output_path

    except Exception as e:
        # BUG FIX: this was `except Exception as e: pass`, which silently
        # swallowed every error, returned None implicitly, and left the task
        # stuck without a terminal status.
        print(f"Error in process_content: {e}")
        tasks[task_id]['progress'] = 'An unexpected error occurred during Gemini processing.'
        tasks[task_id]['status'] = 'Failed'
        return None
    finally:
        # Clean up the temporary input file on every path (the original
        # leaked it on the success path).
        if temp_input_file and os.path.exists(temp_input_file):
            try:
                os.remove(temp_input_file)
            except OSError:
                pass
reddit_scraper.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # reddit_scraper.py
2
+
3
+ import praw
4
+ from prawcore.exceptions import RequestException, ServerError, ResponseException, Forbidden
5
+ import os
6
+ import time
7
+ from dotenv import load_dotenv
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
# Initialize Reddit instance
# Read-only PRAW client configured from .env, created once at import time.
# NOTE(review): if the env vars are missing these arguments come through as
# None and every later API call fails with an auth error — confirm .env is
# present before the module is imported.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT")
)
18
+
19
def wait_and_retry(func, *args, retries=5, backoff_factor=2, **kwargs):
    """Call ``func(*args, **kwargs)`` with exponential backoff on transient errors.

    Retries up to ``retries`` times on rate-limit/server/response errors,
    sleeping ``backoff_factor ** attempt`` seconds between attempts. Returns
    the function's result, or None when a 403 is hit or retries are exhausted.

    BUG FIX: in prawcore, ``Forbidden`` is a subclass of ``ResponseException``,
    so the original handler order made the dedicated ``except Forbidden``
    branch unreachable and 403s were pointlessly retried with backoff. The
    Forbidden handler must come first so forbidden resources are skipped
    immediately, as intended.
    """
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Forbidden:
            # A 403 will not resolve itself; do not burn retries on it.
            print("Access forbidden. Skipping...")
            return None
        except (RequestException, ServerError, ResponseException) as e:
            delay = backoff_factor ** attempt
            print(f"Error: {e}. Retrying in {delay} seconds...")
            time.sleep(delay)
    print(f"Failed after {retries} attempts.")
    return None
37
+
38
def scrape_reddit_user(username, task_id, tasks):
    """Scrape a Reddit user's posts and comments into one Markdown document.

    Progress counters and status are written into ``tasks[task_id]`` so the
    web layer can poll them. Returns the Markdown string, or None when the
    user cannot be fetched or an unexpected error occurs (in which case the
    task is marked 'Failed').
    """
    output_data = ""
    try:
        tasks[task_id]['progress'] = 'Fetching user information...'
        # Get user object
        user = wait_and_retry(reddit.redditor, username)
        if not user:
            print(f"Unable to fetch data for user: {username}")
            tasks[task_id]['progress'] = 'Failed to fetch user data.'
            tasks[task_id]['status'] = 'Failed'
            return None

        output_data += f"# Reddit User: {username}\n\n## 📝 Posts:\n\n"

        # Count posts and comments first so progress can be reported as x/y.
        # NOTE(review): this walks both listings twice (count, then scrape),
        # doubling the API traffic for large accounts.
        tasks[task_id]['progress'] = 'Counting total posts and comments...'
        # BUG FIX: wait_and_retry returns None on failure; coerce to 0 so the
        # counters and progress strings stay numeric.
        total_posts = wait_and_retry(lambda: sum(1 for _ in user.submissions.new(limit=None))) or 0
        total_comments = wait_and_retry(lambda: sum(1 for _ in user.comments.new(limit=None))) or 0
        tasks[task_id]['total_posts'] = total_posts
        tasks[task_id]['total_comments'] = total_comments
        tasks[task_id]['progress'] = f"Total Posts: {total_posts}, Total Comments: {total_comments}\n"

        # Initialize scraped counts
        tasks[task_id]['scraped_posts'] = 0
        tasks[task_id]['scraped_comments'] = 0

        # Scrape posts
        tasks[task_id]['progress'] = 'Scraping posts...'
        submissions = wait_and_retry(user.submissions.new, limit=None)
        if submissions:
            for post in submissions:
                try:
                    output_data += (
                        f"### Title: {post.title}\n"
                        f"**Subreddit:** {post.subreddit}\n"
                        f"**URL:** {post.url}\n"
                        f"**Content:** {post.selftext or 'No Content'}\n\n"
                    )
                    tasks[task_id]['scraped_posts'] += 1
                    tasks[task_id]['progress'] = f"Scraping posts... ({tasks[task_id]['scraped_posts']}/{tasks[task_id]['total_posts']})"
                except Exception as post_error:
                    print(f"Error with post: {post_error}")

        # Add section for comments
        output_data += "\n## 💬 Comments:\n\n"

        # Scrape comments
        tasks[task_id]['progress'] = 'Scraping comments...'
        comments = wait_and_retry(user.comments.new, limit=None)
        if comments:
            for comment in comments:
                try:
                    comment_data = (
                        f"### Comment:\n{comment.body}\n"
                        f"**Subreddit:** {comment.subreddit}\n"
                        f"**Post:** {comment.submission.title}\n"
                    )

                    # Include the parent comment for context when this is a reply.
                    if not comment.is_root:
                        parent_comment = wait_and_retry(comment.parent)
                        if isinstance(parent_comment, praw.models.Comment):
                            comment_data += f"**Parent Comment:** {parent_comment.body}\n"

                    comment_data += "\n"
                    output_data += comment_data
                    tasks[task_id]['scraped_comments'] += 1
                    tasks[task_id]['progress'] = f"Scraping comments... ({tasks[task_id]['scraped_comments']}/{tasks[task_id]['total_comments']})"
                except Exception as comment_error:
                    print(f"Error with comment: {comment_error}")

        print("\nScraping completed!")
        tasks[task_id]['progress'] = 'Scraping completed. Processing data...'
        tasks[task_id]['status'] = 'Processing'
        return output_data

    except Exception as e:
        # BUG FIX: this was a bare `except: pass`, which silently swallowed
        # every error (even KeyboardInterrupt/SystemExit), returned None
        # implicitly, and never marked the task as failed.
        print(f"Error while scraping u/{username}: {e}")
        tasks[task_id]['progress'] = 'An unexpected error occurred while scraping.'
        tasks[task_id]['status'] = 'Failed'
        return None
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Flask
2
+ google-generativeai
3
+ gunicorn
4
+ praw
5
+ python-dotenv
6
+ requests
7
+ tqdm