Refat81 committed on
Commit
67a4166
Β·
verified Β·
1 Parent(s): e8a2c75

Update pages/facebook_extractor.py

Browse files
Files changed (1) hide show
  1. pages/facebook_extractor.py +335 -180
pages/facebook_extractor.py CHANGED
@@ -23,167 +23,297 @@ st.set_page_config(
23
  layout="wide"
24
  )
25
 
26
- class FacebookPublicExtractor:
27
- """Facebook public data extractor that works on Hugging Face"""
28
 
29
  def __init__(self):
30
- self.session = requests.Session()
31
- self.setup_session()
32
 
33
- def setup_session(self):
34
- """Setup requests session with headers"""
35
- self.session.headers.update({
36
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
37
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
38
- 'Accept-Language': 'en-US,en;q=0.5',
39
- 'Accept-Encoding': 'gzip, deflate, br',
40
- 'DNT': '1',
41
- 'Connection': 'keep-alive',
42
- 'Upgrade-Insecure-Requests': '1',
43
- })
44
-
45
- def extract_public_data(self, url: str, data_type: str) -> Dict:
46
- """Extract public data from Facebook URLs"""
47
  try:
48
- st.info(f"🌐 Accessing: {url}")
49
-
50
- # Validate URL type
51
- url_type = self.analyze_facebook_url(url)
52
-
53
- response = self.session.get(url, timeout=15)
54
-
55
- if response.status_code != 200:
56
- return {
57
- "error": f"Failed to access page (Status: {response.status_code})",
58
- "url_type": url_type,
59
- "status": "error"
60
- }
61
-
62
- soup = BeautifulSoup(response.text, 'html.parser')
63
-
64
- # Remove scripts and styles
65
- for script in soup(["script", "style", "meta", "link"]):
66
- script.decompose()
67
 
68
- # Extract basic information
69
- text = soup.get_text()
70
- lines = (line.strip() for line in text.splitlines())
71
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
72
- clean_text = ' '.join(chunk for chunk in chunks if chunk)
73
 
74
- # Extract structured data
75
- page_info = self._extract_page_info(soup, url, response)
76
- content_blocks = self._extract_content_blocks(clean_text, url_type)
77
-
78
- return {
79
- "page_info": page_info,
80
- "content_blocks": content_blocks,
81
- "url_type": url_type,
82
- "extraction_time": datetime.now().isoformat(),
83
- "data_type": data_type,
84
- "status": "success"
85
- }
86
 
87
  except Exception as e:
88
- return {"error": f"Extraction failed: {str(e)}", "status": "error"}
89
-
90
- def analyze_facebook_url(self, url: str) -> str:
91
- """Analyze Facebook URL type"""
92
- url_lower = url.lower()
93
-
94
- if 'facebook.com/groups/' in url_lower:
95
- return "Facebook Group (Limited public data)"
96
- elif 'facebook.com/pages/' in url_lower or '/pages/' in url_lower:
97
- return "Facebook Page (Public data available)"
98
- elif 'facebook.com/events/' in url_lower:
99
- return "Facebook Event (Limited access)"
100
- elif 'facebook.com/marketplace/' in url_lower:
101
- return "Facebook Marketplace"
102
- elif 'facebook.com/' in url_lower and '/posts/' in url_lower:
103
- return "Facebook Post"
104
- else:
105
- return "Facebook Profile/Page"
106
 
107
- def _extract_page_info(self, soup, url: str, response) -> Dict:
108
- """Extract page information"""
109
- title = soup.find('title')
110
- meta_desc = soup.find('meta', attrs={'name': 'description'})
111
- og_title = soup.find('meta', property='og:title')
112
- og_description = soup.find('meta', property='og:description')
113
- og_image = soup.find('meta', property='og:image')
114
-
115
- return {
116
- "title": title.text.strip() if title else "Facebook Content",
117
- "description": meta_desc['content'] if meta_desc else "",
118
- "og_title": og_title['content'] if og_title else "",
119
- "og_description": og_description['content'] if og_description else "",
120
- "og_image": og_image['content'] if og_image else "",
121
- "url": url,
122
- "response_code": response.status_code,
123
- "content_length": len(response.text),
124
- "access_note": self._get_access_note(soup)
125
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- def _extract_content_blocks(self, text: str, url_type: str) -> List[Dict]:
128
- """Extract meaningful content blocks"""
129
  blocks = []
 
 
130
 
131
- # Split into paragraphs/sentences
132
- paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 20]
133
-
134
- for i, paragraph in enumerate(paragraphs[:15]): # Limit to first 15
135
- content_type = self._analyze_content_type(paragraph, url_type)
136
-
137
- block = {
138
  "id": i + 1,
139
  "content": paragraph,
140
  "length": len(paragraph),
141
  "word_count": len(paragraph.split()),
142
- "content_type": content_type,
143
- "has_links": 'http' in paragraph.lower(),
144
- "is_public_content": self._is_public_content(paragraph)
145
- }
146
- blocks.append(block)
147
 
148
  return blocks
149
 
150
- def _analyze_content_type(self, text: str, url_type: str) -> str:
151
- """Analyze content type"""
152
- text_lower = text.lower()
153
 
154
- if any(word in text_lower for word in ['event', 'date', 'time', 'location']):
155
- return "event_info"
156
- elif any(word in text_lower for word in ['group', 'community', 'member', 'join']):
157
- return "community_info"
158
- elif any(word in text_lower for word in ['marketplace', 'buy', 'sell', 'price']):
159
- return "commerce"
160
- elif any(word in text_lower for word in ['post', 'share', 'comment']):
161
- return "social_content"
162
- elif any(word in text_lower for word in ['login', 'sign in']):
163
- return "authentication"
164
  else:
165
- return "general_content"
166
 
167
- def _is_public_content(self, text: str) -> bool:
168
- """Check if content appears to be publicly accessible"""
169
- text_lower = text.lower()
170
- private_indicators = [
171
- 'log in to see', 'sign up to see', 'you must log in',
172
- 'private content', 'restricted access'
173
- ]
174
 
175
- return not any(indicator in text_lower for indicator in private_indicators)
 
 
 
 
 
 
 
 
 
176
 
177
- def _get_access_note(self, soup) -> str:
178
- """Get access level note"""
179
- page_text = soup.get_text().lower()
180
 
181
- if any(phrase in page_text for phrase in ['log in', 'sign in', 'you must be logged in']):
182
- return "Login required for full access"
183
- elif 'content not available' in page_text:
184
- return "Content not publicly available"
185
- else:
186
- return "Public content accessible"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  # AI Functions (same as your LinkedIn analyzer)
189
  def get_embeddings():
@@ -227,20 +357,27 @@ def process_facebook_data(extracted_data):
227
  page_info = extracted_data['page_info']
228
  content_blocks = extracted_data['content_blocks']
229
  url_type = extracted_data['url_type']
 
230
 
231
  all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
232
  all_text += f"πŸ“„ PAGE INFORMATION:\n"
233
  all_text += f"Title: {page_info['title']}\n"
234
- all_text += f"URL: {page_info['url']}\n"
235
  all_text += f"URL Type: {url_type}\n"
236
- all_text += f"Access: {page_info['access_note']}\n"
 
 
 
 
 
 
 
237
  all_text += f"Extracted: {extracted_data['extraction_time']}\n\n"
238
 
239
  all_text += f"πŸ“Š CONTENT ANALYSIS:\n"
240
  all_text += f"Content Blocks: {len(content_blocks)}\n"
241
  all_text += f"Public Content: {sum(1 for b in content_blocks if b['is_public_content'])} blocks\n\n"
242
 
243
- for i, block in enumerate(content_blocks[:10]):
244
  all_text += f"--- BLOCK {i+1} ---\n"
245
  all_text += f"Type: {block['content_type']}\n"
246
  all_text += f"Words: {block['word_count']} | Public: {block['is_public_content']}\n"
@@ -296,15 +433,15 @@ def create_chatbot(vectorstore):
296
  return None
297
 
298
  def main():
299
- st.title("πŸ“˜ Facebook Public Data Extractor")
300
- st.markdown("Extract and analyze public Facebook data - Works on Hugging Face Spaces")
301
 
302
  if st.button("← Back to Main Dashboard"):
303
  st.switch_page("app.py")
304
 
305
  # Initialize session state
306
  if "extractor" not in st.session_state:
307
- st.session_state.extractor = FacebookPublicExtractor()
308
  if "facebook_data" not in st.session_state:
309
  st.session_state.facebook_data = None
310
  if "vectorstore" not in st.session_state:
@@ -320,22 +457,22 @@ def main():
320
 
321
  data_type = st.selectbox(
322
  "Content Type",
323
- ["page", "group", "event", "post", "marketplace"],
324
  help="Select the type of Facebook content"
325
  )
326
 
327
  facebook_url = st.text_input(
328
  "Facebook URL",
329
- placeholder="https://www.facebook.com/Meta/",
330
- help="Enter public Facebook URL (pages work best)"
331
  )
332
 
333
- # Suggested URLs that often work
334
  st.markdown("### πŸš€ Test URLs")
335
  test_urls = {
336
- "Meta (Facebook)": "https://www.facebook.com/Meta/",
337
- "Starbucks": "https://www.facebook.com/Starbucks/",
338
- "NASA": "https://www.facebook.com/NASA/",
339
  }
340
 
341
  for name, url in test_urls.items():
@@ -351,8 +488,8 @@ def main():
351
  elif 'facebook.com' not in url_to_use:
352
  st.error("❌ Please enter a valid Facebook URL")
353
  else:
354
- with st.spinner("πŸ”„ Extracting Facebook data..."):
355
- extracted_data = st.session_state.extractor.extract_public_data(url_to_use, data_type)
356
 
357
  if extracted_data.get("status") == "success":
358
  st.session_state.facebook_data = extracted_data
@@ -363,7 +500,12 @@ def main():
363
  st.session_state.vectorstore = vectorstore
364
  st.session_state.chatbot = create_chatbot(vectorstore)
365
  st.session_state.chat_history = []
366
- st.success(f"βœ… Extracted {len(extracted_data['content_blocks'])} content blocks!")
 
 
 
 
 
367
  else:
368
  st.error("❌ Failed to process data for AI")
369
  else:
@@ -389,52 +531,65 @@ def main():
389
  data = st.session_state.facebook_data
390
  page_info = data['page_info']
391
  content_blocks = data['content_blocks']
 
392
 
393
- st.success("βœ… Facebook Data Extracted")
 
 
 
394
 
395
  # Metrics
396
  col1, col2, col3 = st.columns(3)
397
  with col1:
398
  st.metric("Content Blocks", len(content_blocks))
399
  with col2:
400
- st.metric("Public Content", sum(1 for b in content_blocks if b['is_public_content']))
401
  with col3:
402
- st.metric("Response Code", page_info['response_code'])
403
 
404
  # Page info
405
  st.subheader("🏷️ Page Information")
406
  st.write(f"**Title:** {page_info['title']}")
407
  st.write(f"**URL Type:** {data['url_type']}")
408
- st.write(f"**Access:** {page_info['access_note']}")
409
- st.write(f"**Extracted:** {data['extraction_time'][:19]}")
 
 
 
 
 
 
410
 
411
  # Content samples
412
- st.subheader("πŸ“ Sample Content")
413
- for i, block in enumerate(content_blocks[:3]):
414
- with st.expander(f"Block {i+1} - {block['content_type']} ({block['word_count']} words)"):
415
  st.write(block['content'])
416
- st.caption(f"Public: {block['is_public_content']} | Links: {block['has_links']}")
417
 
418
  else:
419
  st.info("""
420
- ## πŸ“˜ Facebook Public Data Extractor
 
 
421
 
422
  **How it works:**
423
- 1. Enter a public Facebook URL (pages work best)
424
- 2. Click "Extract Facebook Data"
425
- 3. View extracted public content
426
- 4. Chat with AI about the data
427
 
428
- **Supported URLs:**
429
- - 🏒 Public Facebook Pages
430
- - πŸ“˜ Public Groups (limited data)
431
- - πŸŽ‰ Public Events
432
- - πŸ“ Public Posts
433
 
434
- **Limitations:**
435
- - Only public content accessible
436
- - No private group data
437
- - Limited without login
 
438
  """)
439
 
440
  with col2:
@@ -471,10 +626,10 @@ def main():
471
  if not st.session_state.chat_history:
472
  st.subheader("πŸ’‘ Try asking:")
473
  suggestions = [
474
- "What is this Facebook page about?",
475
- "Summarize the available public content",
476
- "What type of content is most common?",
477
- "Analyze the page's public information"
478
  ]
479
 
480
  for suggestion in suggestions:
@@ -482,7 +637,7 @@ def main():
482
  st.info(f"Type: '{suggestion}' in chat")
483
 
484
  elif st.session_state.facebook_data:
485
- st.info("πŸ’¬ Start chatting with the AI about the Facebook data")
486
  else:
487
  st.info("πŸ” Extract Facebook data to enable AI chat")
488
 
 
23
  layout="wide"
24
  )
25
 
26
class FacebookDataSimulator:
    """Simulate Facebook data extraction with demo data.

    Attempts a real page fetch first and falls back to realistic,
    hard-coded demo content when Facebook blocks the request.
    """

    def __init__(self):
        # Pre-built demo fixtures (group/page name lookups) used as the
        # fallback when live extraction is blocked.
        self.demo_data = self._create_demo_data()
 
31
 
32
def extract_data(self, url: str, data_type: str) -> Dict:
    """Extract or simulate Facebook data.

    Tries a live fetch of ``url`` first; on any failure (HTTP error,
    network error, parse error) falls back to demo data so the UI
    always has something to analyze.

    Args:
        url: Facebook URL entered by the user.
        data_type: UI-selected content type; passed through into the result.

    Returns:
        A result dict with ``status`` == "success" and a ``source`` key of
        either "real" or "demo".
    """
    try:
        st.info(f"🔍 Analyzing: {url}")

        # Try real extraction first
        real_data = self._try_real_extraction(url)
        if real_data.get("status") == "success":
            return real_data

        # If real extraction fails, use demo data
        st.warning("⚠️ Using demo data (Facebook restrictions active)")
        return self._get_demo_data(url, data_type)

    except Exception as e:
        # Broad catch is deliberate: this method must never raise into the
        # Streamlit UI — any failure degrades to demo data instead.
        st.error(f"Extraction failed, using demo data: {str(e)}")
        return self._get_demo_data(url, data_type)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
def _try_real_extraction(self, url: str) -> Dict:
    """Try real extraction with better error handling.

    Fetches ``url`` with browser-like headers and parses title/description
    plus text blocks out of the HTML.

    Returns:
        On success, a full result dict with ``status`` == "success" and
        ``source`` == "real"; otherwise ``{"status": "error", "source": "real"}``.
    """
    try:
        # Browser-like headers reduce the chance of an instant block.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }

        # Short timeout: a slow/blocked fetch should fail fast so the demo
        # fallback kicks in. FIX: removed `verify=False` — disabling TLS
        # certificate validation is a security defect (and spams urllib3
        # InsecureRequestWarning); default verification is correct here.
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return {"status": "error", "source": "real"}

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract basic info
        title = soup.find('title')
        description = soup.find('meta', attrs={'name': 'description'})

        return {
            "page_info": {
                "title": title.text if title else "Facebook Content",
                "description": description['content'] if description else "",
                "url": url,
                "response_code": 200,
                "content_length": len(response.text)
            },
            "content_blocks": self._extract_real_content(soup),
            "extraction_time": datetime.now().isoformat(),
            "data_type": "page",
            "status": "success",
            "source": "real"
        }

    except Exception:
        # Any network/parse failure is reported as a plain error marker;
        # the caller falls back to demo data.
        return {"status": "error", "source": "real"}
93
 
94
+ def _extract_real_content(self, soup) -> List[Dict]:
95
+ """Extract content from real page"""
96
  blocks = []
97
+ text = soup.get_text()
98
+ paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
99
 
100
+ for i, paragraph in enumerate(paragraphs[:8]):
101
+ blocks.append({
 
 
 
 
 
102
  "id": i + 1,
103
  "content": paragraph,
104
  "length": len(paragraph),
105
  "word_count": len(paragraph.split()),
106
+ "content_type": "real_content",
107
+ "is_public_content": True
108
+ })
 
 
109
 
110
  return blocks
111
 
112
def _get_demo_data(self, url: str, data_type: str) -> Dict:
    """Pick the demo dataset that matches the URL's apparent type."""
    kind = self._analyze_url_type(url).lower()

    # Guard-clause dispatch: group and page get tailored fixtures,
    # everything else falls through to the generic dataset.
    if 'group' in kind:
        return self._get_group_demo_data(url, data_type)
    if 'page' in kind:
        return self._get_page_demo_data(url, data_type)
    return self._get_general_demo_data(url, data_type)
122
 
123
+ def _analyze_url_type(self, url: str) -> str:
124
+ """Analyze URL type for realistic demo data"""
125
+ url_lower = url.lower()
 
 
 
 
126
 
127
+ if 'group' in url_lower:
128
+ return "Facebook Group"
129
+ elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
130
+ return "Facebook Page"
131
+ elif 'event' in url_lower:
132
+ return "Facebook Event"
133
+ elif 'marketplace' in url_lower:
134
+ return "Facebook Marketplace"
135
+ else:
136
+ return "Facebook Content"
137
 
138
def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
    """Build a realistic demo payload shaped like a public group."""
    group_name = self._extract_name_from_url(url) or "Gaming Community"

    # (content, length, word_count, content_type) for each demo block.
    samples = [
        (f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
         120, 25, "welcome_message"),
        ("Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
         95, 18, "member_post"),
        ("Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
         88, 16, "question_post"),
        ("Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
         102, 19, "event_announcement"),
        ("The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
         78, 14, "community_guidelines"),
    ]
    content_blocks = [
        {
            "id": idx,
            "content": text,
            "length": length,
            "word_count": words,
            "content_type": block_kind,
            "is_public_content": True,
        }
        for idx, (text, length, words, block_kind) in enumerate(samples, start=1)
    ]

    return {
        "page_info": {
            "title": f"{group_name} | Facebook Group",
            "description": f"A community of {group_name} enthusiasts sharing content, discussions, and events.",
            "member_count": "15.7K members",
            "url": url,
            "response_code": 200,
            "content_length": 15000,
            "access_note": "Public group - Limited data due to platform restrictions",
        },
        "content_blocks": content_blocks,
        "url_type": "Facebook Group",
        "extraction_time": datetime.now().isoformat(),
        "data_type": data_type,
        "status": "success",
        "source": "demo",
    }
200
+
201
def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
    """Build a realistic demo payload shaped like a public brand page."""
    page_name = self._extract_name_from_url(url) or "Brand Page"

    # (content, length, word_count, content_type) for each demo block.
    samples = [
        (f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
         98, 15, "welcome_message"),
        ("We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
         92, 16, "announcement"),
        ("Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
         87, 14, "event_followup"),
        ("Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
         85, 15, "support_info"),
    ]
    content_blocks = [
        {
            "id": idx,
            "content": text,
            "length": length,
            "word_count": words,
            "content_type": block_kind,
            "is_public_content": True,
        }
        for idx, (text, length, words, block_kind) in enumerate(samples, start=1)
    ]

    return {
        "page_info": {
            "title": f"{page_name} | Facebook Page",
            "description": f"Official Facebook page of {page_name}. Stay updated with our latest news, products, and community events.",
            "follower_count": "45.2K followers",
            "url": url,
            "response_code": 200,
            "content_length": 12000,
            "access_note": "Public page - Limited data due to platform restrictions",
        },
        "content_blocks": content_blocks,
        "url_type": "Facebook Page",
        "extraction_time": datetime.now().isoformat(),
        "data_type": data_type,
        "status": "success",
        "source": "demo",
    }
255
+
256
+ def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
257
+ """Get general demo data"""
258
+ return {
259
+ "page_info": {
260
+ "title": "Facebook Content",
261
+ "description": "Social media content and community interactions",
262
+ "url": url,
263
+ "response_code": 200,
264
+ "content_length": 8000,
265
+ "access_note": "Public content - Platform restrictions apply"
266
+ },
267
+ "content_blocks": [
268
+ {
269
+ "id": 1,
270
+ "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
271
+ "length": 105,
272
+ "word_count": 16,
273
+ "content_type": "general_content",
274
+ "is_public_content": True
275
+ },
276
+ {
277
+ "id": 2,
278
+ "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
279
+ "length": 82,
280
+ "word_count": 12,
281
+ "content_type": "platform_updates",
282
+ "is_public_content": True
283
+ }
284
+ ],
285
+ "url_type": "Facebook Content",
286
+ "extraction_time": datetime.now().isoformat(),
287
+ "data_type": data_type,
288
+ "status": "success",
289
+ "source": "demo"
290
+ }
291
+
292
+ def _extract_name_from_url(self, url: str) -> str:
293
+ """Extract name from URL for realistic demo data"""
294
+ # Extract name from URL for more realistic demo data
295
+ match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
296
+ if match:
297
+ name = match.group(1)
298
+ # Clean up the name
299
+ name = name.replace('-', ' ').title()
300
+ return name
301
+ return ""
302
+
303
+ def _create_demo_data(self) -> Dict:
304
+ """Create comprehensive demo data"""
305
+ return {
306
+ "groups": {
307
+ "gamersofbangladesh2": "Gaming Community Bangladesh",
308
+ "programmingcommunity": "Programming Community",
309
+ "startupdiscussions": "Startup Discussions"
310
+ },
311
+ "pages": {
312
+ "meta": "Meta Official",
313
+ "starbucks": "Starbucks Coffee",
314
+ "nasa": "NASA"
315
+ }
316
+ }
317
 
318
  # AI Functions (same as your LinkedIn analyzer)
319
  def get_embeddings():
 
357
  page_info = extracted_data['page_info']
358
  content_blocks = extracted_data['content_blocks']
359
  url_type = extracted_data['url_type']
360
+ source = extracted_data.get('source', 'unknown')
361
 
362
  all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
363
  all_text += f"πŸ“„ PAGE INFORMATION:\n"
364
  all_text += f"Title: {page_info['title']}\n"
 
365
  all_text += f"URL Type: {url_type}\n"
366
+ all_text += f"Data Source: {source.upper()}\n"
367
+ all_text += f"Access: {page_info.get('access_note', 'Public content')}\n"
368
+
369
+ if page_info.get('member_count'):
370
+ all_text += f"Members: {page_info['member_count']}\n"
371
+ elif page_info.get('follower_count'):
372
+ all_text += f"Followers: {page_info['follower_count']}\n"
373
+
374
  all_text += f"Extracted: {extracted_data['extraction_time']}\n\n"
375
 
376
  all_text += f"πŸ“Š CONTENT ANALYSIS:\n"
377
  all_text += f"Content Blocks: {len(content_blocks)}\n"
378
  all_text += f"Public Content: {sum(1 for b in content_blocks if b['is_public_content'])} blocks\n\n"
379
 
380
+ for i, block in enumerate(content_blocks):
381
  all_text += f"--- BLOCK {i+1} ---\n"
382
  all_text += f"Type: {block['content_type']}\n"
383
  all_text += f"Words: {block['word_count']} | Public: {block['is_public_content']}\n"
 
433
  return None
434
 
435
  def main():
436
+ st.title("πŸ“˜ Facebook Data Extractor")
437
+ st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
438
 
439
  if st.button("← Back to Main Dashboard"):
440
  st.switch_page("app.py")
441
 
442
  # Initialize session state
443
  if "extractor" not in st.session_state:
444
+ st.session_state.extractor = FacebookDataSimulator()
445
  if "facebook_data" not in st.session_state:
446
  st.session_state.facebook_data = None
447
  if "vectorstore" not in st.session_state:
 
457
 
458
  data_type = st.selectbox(
459
  "Content Type",
460
+ ["group", "page", "event", "post", "general"],
461
  help="Select the type of Facebook content"
462
  )
463
 
464
  facebook_url = st.text_input(
465
  "Facebook URL",
466
+ placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
467
+ help="Enter any Facebook URL for analysis"
468
  )
469
 
470
+ # Quick test URLs
471
  st.markdown("### πŸš€ Test URLs")
472
  test_urls = {
473
+ "Gaming Group": "https://www.facebook.com/groups/gamersofbangladesh2",
474
+ "Tech Community": "https://www.facebook.com/groups/programmingcommunity",
475
+ "Business Page": "https://www.facebook.com/Meta/",
476
  }
477
 
478
  for name, url in test_urls.items():
 
488
  elif 'facebook.com' not in url_to_use:
489
  st.error("❌ Please enter a valid Facebook URL")
490
  else:
491
+ with st.spinner("πŸ”„ Analyzing Facebook data..."):
492
+ extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
493
 
494
  if extracted_data.get("status") == "success":
495
  st.session_state.facebook_data = extracted_data
 
500
  st.session_state.vectorstore = vectorstore
501
  st.session_state.chatbot = create_chatbot(vectorstore)
502
  st.session_state.chat_history = []
503
+
504
+ source = extracted_data.get('source', 'unknown')
505
+ if source == 'demo':
506
+ st.warning("πŸ“ Using realistic demo data (Facebook restrictions active)")
507
+ else:
508
+ st.success("βœ… Real data extracted successfully!")
509
  else:
510
  st.error("❌ Failed to process data for AI")
511
  else:
 
531
  data = st.session_state.facebook_data
532
  page_info = data['page_info']
533
  content_blocks = data['content_blocks']
534
+ source = data.get('source', 'unknown')
535
 
536
+ if source == 'demo':
537
+ st.warning("πŸ“ **Demo Data** - Realistic simulation (Facebook restrictions)")
538
+ else:
539
+ st.success("βœ… **Real Data** - Successfully extracted")
540
 
541
  # Metrics
542
  col1, col2, col3 = st.columns(3)
543
  with col1:
544
  st.metric("Content Blocks", len(content_blocks))
545
  with col2:
546
+ st.metric("Data Source", source.upper())
547
  with col3:
548
+ st.metric("Status", "Success")
549
 
550
  # Page info
551
  st.subheader("🏷️ Page Information")
552
  st.write(f"**Title:** {page_info['title']}")
553
  st.write(f"**URL Type:** {data['url_type']}")
554
+ st.write(f"**Description:** {page_info.get('description', 'No description')}")
555
+
556
+ if page_info.get('member_count'):
557
+ st.write(f"**Members:** {page_info['member_count']}")
558
+ elif page_info.get('follower_count'):
559
+ st.write(f"**Followers:** {page_info['follower_count']}")
560
+
561
+ st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
562
 
563
  # Content samples
564
+ st.subheader("πŸ“ Content Analysis")
565
+ for i, block in enumerate(content_blocks):
566
+ with st.expander(f"Content {i+1} - {block['content_type']} ({block['word_count']} words)"):
567
  st.write(block['content'])
568
+ st.caption(f"Public: {block['is_public_content']}")
569
 
570
  else:
571
  st.info("""
572
+ ## πŸ“˜ Facebook Data Extractor
573
+
574
+ **University Project Feature**
575
 
576
  **How it works:**
577
+ 1. Enter any Facebook URL
578
+ 2. System tries real data extraction
579
+ 3. If blocked, uses **realistic demo data**
580
+ 4. Full AI analysis available
581
 
582
+ **Features:**
583
+ - Real data extraction when possible
584
+ - Realistic demo data when restricted
585
+ - Full AI-powered analysis
586
+ - Professional interface
587
 
588
+ **Perfect for demonstrating:**
589
+ - Social media data extraction concepts
590
+ - AI analysis capabilities
591
+ - Platform integration
592
+ - Error handling strategies
593
  """)
594
 
595
  with col2:
 
626
  if not st.session_state.chat_history:
627
  st.subheader("πŸ’‘ Try asking:")
628
  suggestions = [
629
+ "What is this Facebook group/page about?",
630
+ "Summarize the main content and purpose",
631
+ "What kind of community is this?",
632
+ "Analyze the engagement and activity level"
633
  ]
634
 
635
  for suggestion in suggestions:
 
637
  st.info(f"Type: '{suggestion}' in chat")
638
 
639
  elif st.session_state.facebook_data:
640
+ st.info("πŸ’¬ Start chatting with AI about the Facebook data")
641
  else:
642
  st.info("πŸ” Extract Facebook data to enable AI chat")
643