Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| import os | |
# Line of '=' characters that separates one post block from the next in the raw dump.
_POST_SEPARATOR = '=================================================='
# Marker that introduces the comment lines inside a post block.
_COMMENTS_MARKER = 'COMMENTS:'
# Column layout of the output CSV; also used for the empty placeholder file
# so that both code paths produce the same schema.
_OUTPUT_COLUMNS = ['post_id', 'text', 'type', 'link']
_FACEBOOK_BASE_URL = "https://www.facebook.com"
_DEFAULT_RAW_FILE_PATH = os.path.join('perfected_data', 'raw_posts_to_parse.txt')
_DEFAULT_OUTPUT_CSV_PATH = os.path.join('perfected_data', 'all_posts_with_comments.csv')
# Compiled once at import time instead of once per post block.
_POST_ID_RE = re.compile(r'Post ID:\s*(\S+)')
_POST_TEXT_RE = re.compile(r'POST:\n(.*?)\nCOMMENTS:', re.DOTALL)


def _create_link(pid):
    """Return the Facebook post URL for a raw post id, or the site root if unusable."""
    # A missing id can arrive as None or as NaN (a float), depending on how
    # pandas stored the column, so test the type rather than truthiness.
    if not isinstance(pid, str) or 'PR_' not in pid:
        return _FACEBOOK_BASE_URL
    actual_id = pid.split('PR_')[1]
    return f"{_FACEBOOK_BASE_URL}/posts/{actual_id}"


def _parse_post_block(post_block):
    """Return the row dicts (post first, then its comments) found in one raw block."""
    id_match = _POST_ID_RE.search(post_block)
    post_id = id_match.group(1) if id_match else None

    rows = []
    text_match = _POST_TEXT_RE.search(post_block)
    if text_match:
        post_text = text_match.group(1).replace('\n', ' ').strip()
        rows.append({'post_id': post_id, 'text': f"POST: {post_text}", 'type': 'post'})
        # Comments start right after the marker that closed the post text.
        comments_section = post_block[text_match.end():]
    else:
        # No recognisable post body: still collect comments if the marker is
        # present. Cutting only at the FIRST marker keeps comment lines that
        # themselves contain the text "COMMENTS:".
        comments_section = post_block.partition(_COMMENTS_MARKER)[2]

    for line in comments_section.split('\n'):
        comment = line.strip()
        if comment:
            rows.append({'post_id': post_id, 'text': comment, 'type': 'comment'})
    return rows


def run_parser(raw_file_path=_DEFAULT_RAW_FILE_PATH,
               output_csv_path=_DEFAULT_OUTPUT_CSV_PATH) -> None:
    """Parse the raw scraped-posts dump into a flat CSV of posts and comments.

    The raw file is split into post blocks on a line of '=' characters. Each
    block contributes one row of type 'post' (when a ``POST:`` ... ``COMMENTS:``
    section is found) and one row of type 'comment' per non-empty line after
    the ``COMMENTS:`` marker. Every row carries the block's post id and a
    link derived from it.

    Args:
        raw_file_path: Path of the raw text dump to read. Defaults to
            ``perfected_data/raw_posts_to_parse.txt``.
        output_csv_path: Path of the CSV to write. Its parent directory is
            created if needed. Defaults to
            ``perfected_data/all_posts_with_comments.csv``.

    Returns:
        None. Results are written to ``output_csv_path``; progress is printed.
        If the raw file is missing, an empty CSV with the regular header is
        created only when no output CSV exists yet. If nothing could be
        parsed, no file is written.
    """
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:  # os.makedirs('') raises, so skip it for bare file names
        os.makedirs(output_dir, exist_ok=True)

    print("--- [PARSER STATUS] --- Starting parser.")

    if not os.path.exists(raw_file_path):
        print("--- [PARSER STATUS] --- Raw data file not found. Cannot create clean CSV.")
        # Never overwrite an existing CSV from an earlier successful run.
        if not os.path.exists(output_csv_path):
            pd.DataFrame(columns=_OUTPUT_COLUMNS).to_csv(output_csv_path, index=False)
        return

    print(f"--- [PARSER STATUS] --- Reading raw data from '{raw_file_path}'...")
    with open(raw_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    all_rows = []
    for post_block in content.split(_POST_SEPARATOR):
        if post_block.strip():
            all_rows.extend(_parse_post_block(post_block))

    if not all_rows:
        print("--- [PARSER STATUS] --- No data parsed.")
        return

    df = pd.DataFrame(all_rows)
    df['link'] = df['post_id'].apply(_create_link)
    df.to_csv(output_csv_path, index=False)
    print(f"--- [PARSER STATUS] --- ✅ Successfully created clean CSV with {len(df)} rows.")
    return