Tanmoy-AI commited on
Commit
b30e65f
·
1 Parent(s): ce90134

added parser in app.py

Browse files
Files changed (3) hide show
  1. app.py +2 -0
  2. create_test_data.py +1 -0
  3. parse_raw_data.py +41 -0
app.py CHANGED
@@ -7,6 +7,7 @@ from src.data_processor import DataProcessor
7
  from src.insights_generator import InsightsGenerator
8
  from src.visualizations import *
9
  from dotenv import load_dotenv
 
10
 
11
  # Load environment variables from .env file
12
  load_dotenv()
@@ -51,6 +52,7 @@ def find_text_column(df):
51
  def load_and_process_data():
52
  DATA_DIR = 'data/uploads'
53
  PERFECTED_DATA_DIR = 'perfected_data'
 
54
  if not os.path.exists(DATA_DIR):
55
  os.makedirs(DATA_DIR)
56
  if not os.path.exists(PERFECTED_DATA_DIR):
 
7
  from src.insights_generator import InsightsGenerator
8
  from src.visualizations import *
9
  from dotenv import load_dotenv
10
+ from parse_raw_data import run_parser
11
 
12
  # Load environment variables from .env file
13
  load_dotenv()
 
52
  def load_and_process_data():
53
  DATA_DIR = 'data/uploads'
54
  PERFECTED_DATA_DIR = 'perfected_data'
55
+ run_parser()
56
  if not os.path.exists(DATA_DIR):
57
  os.makedirs(DATA_DIR)
58
  if not os.path.exists(PERFECTED_DATA_DIR):
create_test_data.py CHANGED
@@ -2,6 +2,7 @@
2
  import pandas as pd
3
  import os
4
 
 
5
  # --- Create directories if they don't exist ---
6
  UPLOAD_DIR = 'data/uploads'
7
  PERFECTED_DIR = 'perfected_data'
 
2
  import pandas as pd
3
  import os
4
 
5
+
6
  # --- Create directories if they don't exist ---
7
  UPLOAD_DIR = 'data/uploads'
8
  PERFECTED_DIR = 'perfected_data'
parse_raw_data.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import re
import os


def run_parser():
    """Parse the scraped raw-posts text dump into a clean CSV.

    Reads ``perfected_data/raw_posts_to_parse.txt``, where individual posts
    are separated by a 50-character ``=`` divider line, extracts each post's
    ID, post text and comment lines, and writes
    ``perfected_data/all_posts_with_comments.csv`` with columns
    ``post_id``, ``text``, ``type`` and ``link``.

    If the raw file is missing, a placeholder CSV is written (only when no
    CSV exists yet) so downstream loaders find a file. Returns None; all
    results are communicated via the output CSV and status prints.
    """
    RAW_FILE_PATH = os.path.join('perfected_data', 'raw_posts_to_parse.txt')
    OUTPUT_CSV_PATH = os.path.join('perfected_data', 'all_posts_with_comments.csv')
    os.makedirs('perfected_data', exist_ok=True)

    print("--- [PARSER STATUS] --- Starting parser.")
    if not os.path.exists(RAW_FILE_PATH):
        print("--- [PARSER STATUS] --- Raw data file not found. Cannot create clean CSV.")
        if not os.path.exists(OUTPUT_CSV_PATH):
            # NOTE(review): placeholder columns differ from the parsed output
            # (no 'post_id'/'type') — confirm downstream readers accept both.
            pd.DataFrame(columns=['text', 'link']).to_csv(OUTPUT_CSV_PATH, index=False)
        return

    print(f"--- [PARSER STATUS] --- Reading raw data from '{RAW_FILE_PATH}'...")
    with open(RAW_FILE_PATH, 'r', encoding='utf-8') as f:
        content = f.read()

    # Posts in the raw dump are delimited by a 50-char '=' divider line.
    posts = content.split('==================================================')
    all_rows = []
    for post_block in posts:
        if not post_block.strip():
            continue
        # 'Post ID:' line may be absent; post_id stays None in that case.
        id_match = re.search(r'Post ID:\s*(\S+)', post_block)
        post_id = id_match.group(1) if id_match else None
        # Post body sits between the 'POST:' and 'COMMENTS:' markers.
        post_text_match = re.search(r'POST:\n(.*?)\nCOMMENTS:', post_block, re.DOTALL)
        if post_text_match:
            post_text = post_text_match.group(1).replace('\n', ' ').strip()
            all_rows.append({'post_id': post_id, 'text': f"POST: {post_text}", 'type': 'post'})
        if 'COMMENTS:' in post_block:
            # Each non-blank line after 'COMMENTS:' is one comment row.
            comments_section = post_block.split('COMMENTS:')[1]
            for line in comments_section.strip().split('\n'):
                if line.strip():
                    all_rows.append({'post_id': post_id, 'text': line.strip(), 'type': 'comment'})

    if not all_rows:
        print("--- [PARSER STATUS] --- No data parsed.")
        return

    df = pd.DataFrame(all_rows)

    def create_link(pid):
        # Only IDs containing 'PR_' map to a concrete post URL; anything
        # else falls back to the Facebook home page.
        if not pid or 'PR_' not in pid:
            return "https://www.facebook.com"
        actual_id = pid.split('PR_')[1]
        return f"https://www.facebook.com/posts/{actual_id}"

    df['link'] = df['post_id'].apply(create_link)
    df.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"--- [PARSER STATUS] --- ✅ Successfully created clean CSV with {len(df)} rows.")