Refat81 committed on
Commit
fd2cc7f
·
verified ·
1 Parent(s): 47ac751

Update pages/facebook_extractor.py

Browse files
Files changed (1) hide show
  1. pages/facebook_extractor.py +121 -541
pages/facebook_extractor.py CHANGED
@@ -9,14 +9,13 @@ from typing import List, Dict
9
  import os
10
  import tempfile
11
 
12
- # Import your existing AI components
13
- from langchain_text_splitters import CharacterTextSplitter
14
- from langchain_community.embeddings import HuggingFaceEmbeddings
15
  from langchain.vectorstores import FAISS
16
  from langchain.memory import ConversationBufferMemory
17
  from langchain.chains import ConversationalRetrievalChain
18
  from langchain.schema import Document
19
- from langchain_community.llms import HuggingFaceHub
20
 
21
  st.set_page_config(
22
  page_title="Facebook Data Extractor",
@@ -35,12 +34,10 @@ class FacebookDataSimulator:
35
  try:
36
  st.info(f"πŸ” Analyzing: {url}")
37
 
38
- # Try real extraction first
39
  real_data = self._try_real_extraction(url)
40
  if real_data.get("status") == "success":
41
  return real_data
42
 
43
- # If real extraction fails, use demo data
44
  st.warning("⚠️ Using demo data (Facebook restrictions active)")
45
  return self._get_demo_data(url, data_type)
46
 
@@ -49,29 +46,15 @@ class FacebookDataSimulator:
49
  return self._get_demo_data(url, data_type)
50
 
51
  def _try_real_extraction(self, url: str) -> Dict:
52
- """Try real extraction with better error handling"""
53
  try:
54
- # Use a proxy-like approach with different user agents
55
  headers = {
56
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
57
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
58
- 'Accept-Language': 'en-US,en;q=0.5',
59
- 'Accept-Encoding': 'gzip, deflate, br',
60
- 'DNT': '1',
61
- 'Connection': 'keep-alive',
62
- 'Upgrade-Insecure-Requests': '1',
63
  }
64
-
65
- # Try with shorter timeout
66
  response = requests.get(url, headers=headers, timeout=10, verify=False)
67
-
68
  if response.status_code == 200:
69
  soup = BeautifulSoup(response.text, 'html.parser')
70
-
71
- # Extract basic info
72
  title = soup.find('title')
73
  description = soup.find('meta', attrs={'name': 'description'})
74
-
75
  return {
76
  "page_info": {
77
  "title": title.text if title else "Facebook Content",
@@ -88,16 +71,13 @@ class FacebookDataSimulator:
88
  }
89
  else:
90
  return {"status": "error", "source": "real"}
91
-
92
  except Exception:
93
  return {"status": "error", "source": "real"}
94
 
95
  def _extract_real_content(self, soup) -> List[Dict]:
96
- """Extract content from real page"""
97
  blocks = []
98
  text = soup.get_text()
99
  paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
100
-
101
  for i, paragraph in enumerate(paragraphs[:8]):
102
  blocks.append({
103
  "id": i + 1,
@@ -107,13 +87,10 @@ class FacebookDataSimulator:
107
  "content_type": "real_content",
108
  "is_public_content": True
109
  })
110
-
111
  return blocks
112
 
113
  def _get_demo_data(self, url: str, data_type: str) -> Dict:
114
- """Get realistic demo data based on URL type"""
115
  url_type = self._analyze_url_type(url)
116
-
117
  if 'group' in url_type.lower():
118
  return self._get_group_demo_data(url, data_type)
119
  elif 'page' in url_type.lower():
@@ -122,9 +99,7 @@ class FacebookDataSimulator:
122
  return self._get_general_demo_data(url, data_type)
123
 
124
  def _analyze_url_type(self, url: str) -> str:
125
- """Analyze URL type for realistic demo data"""
126
  url_lower = url.lower()
127
-
128
  if 'group' in url_lower:
129
  return "Facebook Group"
130
  elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
@@ -137,9 +112,7 @@ class FacebookDataSimulator:
137
  return "Facebook Content"
138
 
139
  def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
140
- """Get realistic group demo data"""
141
  group_name = self._extract_name_from_url(url) or "Gaming Community"
142
-
143
  return {
144
  "page_info": {
145
  "title": f"{group_name} | Facebook Group",
@@ -151,46 +124,11 @@ class FacebookDataSimulator:
151
  "access_note": "Public group - Limited data due to platform restrictions"
152
  },
153
  "content_blocks": [
154
- {
155
- "id": 1,
156
- "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
157
- "length": 120,
158
- "word_count": 25,
159
- "content_type": "welcome_message",
160
- "is_public_content": True
161
- },
162
- {
163
- "id": 2,
164
- "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
165
- "length": 95,
166
- "word_count": 18,
167
- "content_type": "member_post",
168
- "is_public_content": True
169
- },
170
- {
171
- "id": 3,
172
- "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
173
- "length": 88,
174
- "word_count": 16,
175
- "content_type": "question_post",
176
- "is_public_content": True
177
- },
178
- {
179
- "id": 4,
180
- "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
181
- "length": 102,
182
- "word_count": 19,
183
- "content_type": "event_announcement",
184
- "is_public_content": True
185
- },
186
- {
187
- "id": 5,
188
- "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
189
- "length": 78,
190
- "word_count": 14,
191
- "content_type": "community_guidelines",
192
- "is_public_content": True
193
- }
194
  ],
195
  "url_type": "Facebook Group",
196
  "extraction_time": datetime.now().isoformat(),
@@ -200,9 +138,7 @@ class FacebookDataSimulator:
200
  }
201
 
202
  def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
203
- """Get realistic page demo data"""
204
  page_name = self._extract_name_from_url(url) or "Brand Page"
205
-
206
  return {
207
  "page_info": {
208
  "title": f"{page_name} | Facebook Page",
@@ -214,38 +150,10 @@ class FacebookDataSimulator:
214
  "access_note": "Public page - Limited data due to platform restrictions"
215
  },
216
  "content_blocks": [
217
- {
218
- "id": 1,
219
- "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
220
- "length": 98,
221
- "word_count": 15,
222
- "content_type": "welcome_message",
223
- "is_public_content": True
224
- },
225
- {
226
- "id": 2,
227
- "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
228
- "length": 92,
229
- "word_count": 16,
230
- "content_type": "announcement",
231
- "is_public_content": True
232
- },
233
- {
234
- "id": 3,
235
- "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
236
- "length": 87,
237
- "word_count": 14,
238
- "content_type": "event_followup",
239
- "is_public_content": True
240
- },
241
- {
242
- "id": 4,
243
- "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
244
- "length": 85,
245
- "word_count": 15,
246
- "content_type": "support_info",
247
- "is_public_content": True
248
- }
249
  ],
250
  "url_type": "Facebook Page",
251
  "extraction_time": datetime.now().isoformat(),
@@ -255,7 +163,6 @@ class FacebookDataSimulator:
255
  }
256
 
257
  def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
258
- """Get general demo data"""
259
  return {
260
  "page_info": {
261
  "title": "Facebook Content",
@@ -266,22 +173,8 @@ class FacebookDataSimulator:
266
  "access_note": "Public content - Platform restrictions apply"
267
  },
268
  "content_blocks": [
269
- {
270
- "id": 1,
271
- "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
272
- "length": 105,
273
- "word_count": 16,
274
- "content_type": "general_content",
275
- "is_public_content": True
276
- },
277
- {
278
- "id": 2,
279
- "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
280
- "length": 82,
281
- "word_count": 12,
282
- "content_type": "platform_updates",
283
- "is_public_content": True
284
- }
285
  ],
286
  "url_type": "Facebook Content",
287
  "extraction_time": datetime.now().isoformat(),
@@ -291,18 +184,14 @@ class FacebookDataSimulator:
291
  }
292
 
293
  def _extract_name_from_url(self, url: str) -> str:
294
- """Extract name from URL for realistic demo data"""
295
- # Extract name from URL for more realistic demo data
296
  match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
297
  if match:
298
  name = match.group(1)
299
- # Clean up the name
300
  name = name.replace('-', ' ').title()
301
  return name
302
  return ""
303
-
304
  def _create_demo_data(self) -> Dict:
305
- """Create comprehensive demo data"""
306
  return {
307
  "groups": {
308
  "gamersofbangladesh2": "Gaming Community Bangladesh",
@@ -316,252 +205,99 @@ class FacebookDataSimulator:
316
  }
317
  }
318
 
 
 
319
  def get_embeddings():
320
- """Initialize embeddings with better error handling and cache management"""
321
- try:
322
- # Try multiple embedding models with different cache directories
323
- model_options = [
324
- "sentence-transformers/all-MiniLM-L6-v2",
325
- "sentence-transformers/paraphrase-MiniLM-L3-v2",
326
- "sentence-transformers/all-mpnet-base-v2"
327
- ]
328
-
329
- for model_name in model_options:
330
- try:
331
- st.info(f"πŸ”„ Trying embedding model: {model_name}")
332
-
333
- # Use temporary directory for cache to avoid permission issues
334
- with tempfile.TemporaryDirectory() as temp_cache:
335
- embeddings = HuggingFaceEmbeddings(
336
- model_name=model_name,
337
- cache_folder=temp_cache,
338
- model_kwargs={'device': 'cpu'}
339
- )
340
-
341
- # Test the embeddings
342
- test_text = "Hello world"
343
- test_embedding = embeddings.embed_query(test_text)
344
- if test_embedding and len(test_embedding) > 0:
345
- st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
346
- return embeddings
347
-
348
- except Exception as e:
349
- st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")
350
- continue
351
-
352
- # If all models fail, try without cache
353
- st.warning("πŸ”„ Trying fallback embedding method...")
354
- try:
355
- embeddings = HuggingFaceEmbeddings(
356
- model_name="sentence-transformers/all-MiniLM-L6-v2"
357
- )
358
- st.success("βœ… Loaded fallback embeddings")
359
- return embeddings
360
- except Exception as e:
361
- st.error(f"❌ All embedding models failed: {e}")
362
- return None
363
-
364
- except Exception as e:
365
- st.error(f"❌ Embeddings error: {e}")
366
  return None
367
 
 
 
 
 
 
 
 
 
368
  def get_llm():
369
- """Initialize HuggingFace LLM"""
370
- try:
371
- api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
372
- if not api_key:
373
- st.error("HuggingFace API Key not found")
374
- return None
375
-
376
- # Try multiple models
377
- model_options = [
378
- "mistralai/Mistral-7B-Instruct-v0.1",
379
- "google/flan-t5-large",
380
- "microsoft/DialoGPT-large"
381
- ]
382
-
383
- for model_id in model_options:
384
- try:
385
- st.info(f"πŸ”„ Trying LLM: {model_id}")
386
-
387
- llm = HuggingFaceHub(
388
- repo_id=model_id,
389
- huggingfacehub_api_token=api_key,
390
- model_kwargs={
391
- "temperature": 0.7,
392
- "max_length": 512,
393
- "max_new_tokens": 256,
394
- }
395
- )
396
-
397
- # Test the model
398
- test_response = llm.invoke("Hello")
399
- if test_response and len(test_response.strip()) > 0:
400
- st.success(f"βœ… Loaded LLM: {model_id.split('/')[-1]}")
401
- return llm
402
-
403
- except Exception as e:
404
- st.warning(f"⚠️ Failed to load {model_id}: {str(e)}")
405
- continue
406
-
407
- st.error("❌ All LLMs failed to load")
408
- return None
409
-
410
- except Exception as e:
411
- st.error(f"❌ LLM error: {e}")
412
  return None
413
 
 
 
 
 
 
 
 
 
414
  def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
415
- """Simple rule-based chat analysis when embeddings fail"""
416
  try:
417
  if not extracted_data:
418
- return "No data available for analysis."
419
 
420
  page_info = extracted_data.get('page_info', {})
421
  content_blocks = extracted_data.get('content_blocks', [])
422
  url_type = extracted_data.get('url_type', 'Facebook Content')
423
  source = extracted_data.get('source', 'demo')
424
-
425
  user_input_lower = user_input.lower()
426
-
427
- # Basic analysis based on input
428
- if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
429
- return f"""**πŸ“Š Summary of {page_info.get('title', 'Facebook Content')}**
430
-
431
- **Type:** {url_type}
432
- **Data Source:** {source.upper()}
433
- **Description:** {page_info.get('description', 'No description available')}
434
-
435
- This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.
436
-
437
- **Key Content Types:**
438
- {', '.join(set(block['content_type'] for block in content_blocks))}
439
-
440
- The content focuses on community engagement and social interactions."""
441
-
442
- elif any(word in user_input_lower for word in ['purpose', 'about', 'what is']):
443
- return f"""**🎯 Purpose Analysis**
444
-
445
- Based on the extracted data, this {url_type.lower()} appears to be focused on:
446
-
447
- - **Community Building:** {len([b for b in content_blocks if 'community' in b['content_type'].lower()])} community-related posts
448
- - **Information Sharing:** {len([b for b in content_blocks if 'announcement' in b['content_type'].lower()])} announcements
449
- - **Member Engagement:** {len([b for b in content_blocks if 'post' in b['content_type'].lower()])} member posts
450
-
451
- **Overall Purpose:** {page_info.get('description', 'Community engagement and content sharing')}"""
452
-
453
- elif any(word in user_input_lower for word in ['activity', 'engagement', 'active']):
454
- active_blocks = len([b for b in content_blocks if any(word in b['content_type'].lower() for word in ['post', 'question', 'event'])])
455
- return f"""**πŸ“ˆ Activity Analysis**
456
-
457
- **Content Activity Level:**
458
- - Total Content Blocks: {len(content_blocks)}
459
- - Active Engagement Posts: {active_blocks}
460
- - Informational Posts: {len(content_blocks) - active_blocks}
461
 
462
- The {url_type.lower()} shows a good mix of member engagement and informational content, suggesting an active community."""
463
-
 
 
464
  else:
465
- return f"""**πŸ€– Analysis Response**
466
-
467
- I've analyzed the {url_type.lower()} data for you.
468
-
469
- **Your question:** "{user_input}"
470
- **Content Source:** {source.upper()} data
471
- **Content Type:** {url_type}
472
-
473
- This {url_type.lower()} contains {len(content_blocks)} pieces of content focusing on community engagement and information sharing.
474
-
475
- **Try asking:**
476
- - "What is the main purpose of this group/page?"
477
- - "Summarize the content and activities"
478
- - "What kind of engagement does this content show?""""
479
-
480
  except Exception as e:
481
  return f"Analysis error: {str(e)}"
482
 
483
  def process_facebook_data(extracted_data):
484
- """Process extracted data for AI analysis with fallbacks"""
485
  if not extracted_data or extracted_data.get("status") != "success":
486
  return None, []
487
-
488
- page_info = extracted_data['page_info']
489
- content_blocks = extracted_data['content_blocks']
490
- url_type = extracted_data['url_type']
491
- source = extracted_data.get('source', 'unknown')
492
-
493
- all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
494
- all_text += f"πŸ“„ PAGE INFORMATION:\n"
495
- all_text += f"Title: {page_info['title']}\n"
496
- all_text += f"URL Type: {url_type}\n"
497
- all_text += f"Data Source: {source.upper()}\n"
498
- all_text += f"Access: {page_info.get('access_note', 'Public content')}\n"
499
-
500
- if page_info.get('member_count'):
501
- all_text += f"Members: {page_info['member_count']}\n"
502
- elif page_info.get('follower_count'):
503
- all_text += f"Followers: {page_info['follower_count']}\n"
504
-
505
- all_text += f"Extracted: {extracted_data['extraction_time']}\n\n"
506
-
507
- all_text += f"πŸ“Š CONTENT ANALYSIS:\n"
508
- all_text += f"Content Blocks: {len(content_blocks)}\n"
509
- all_text += f"Public Content: {sum(1 for b in content_blocks if b['is_public_content'])} blocks\n\n"
510
-
511
- for i, block in enumerate(content_blocks):
512
- all_text += f"--- BLOCK {i+1} ---\n"
513
- all_text += f"Type: {block['content_type']}\n"
514
- all_text += f"Words: {block['word_count']} | Public: {block['is_public_content']}\n"
515
- all_text += f"Content: {block['content']}\n\n"
516
-
517
- all_text += "="*50
518
-
519
- # Split into chunks
520
- splitter = CharacterTextSplitter(
521
- separator="\n",
522
- chunk_size=1000,
523
- chunk_overlap=200,
524
- length_function=len
525
- )
526
-
527
  chunks = splitter.split_text(all_text)
528
  documents = [Document(page_content=chunk) for chunk in chunks]
529
-
530
- return "simple", documents # Return simple mode instead of vectorstore
 
 
 
 
 
531
 
532
  def create_chatbot(vectorstore):
533
- """Create conversational chatbot"""
534
- try:
535
- llm = get_llm()
536
- if llm is None:
537
- return "simple" # Return simple mode if LLM fails
538
-
539
- memory = ConversationBufferMemory(
540
- memory_key="chat_history",
541
- return_messages=True,
542
- output_key="answer"
543
- )
544
-
545
- chain = ConversationalRetrievalChain.from_llm(
546
- llm=llm,
547
- retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
548
- memory=memory,
549
- return_source_documents=True,
550
- output_key="answer"
551
- )
552
- return chain
553
- except Exception as e:
554
- st.error(f"Chatbot creation failed: {str(e)}")
555
- return "simple" # Fallback to simple mode
556
 
557
  def main():
558
- st.title("πŸ“˜ Facebook Data Extractor")
559
- st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
560
 
561
  if st.button("← Back to Main Dashboard"):
562
  st.switch_page("app.py")
563
-
564
- # Initialize session state
565
  if "extractor" not in st.session_state:
566
  st.session_state.extractor = FacebookDataSimulator()
567
  if "facebook_data" not in st.session_state:
@@ -573,225 +309,69 @@ def main():
573
  if "chat_history" not in st.session_state:
574
  st.session_state.chat_history = []
575
  if "processing_mode" not in st.session_state:
576
- st.session_state.processing_mode = "ai" # ai or simple
577
-
578
  # Sidebar
579
  with st.sidebar:
580
  st.header("βš™οΈ Facebook Configuration")
581
-
582
- data_type = st.selectbox(
583
- "Content Type",
584
- ["group", "page", "event", "post", "general"],
585
- help="Select the type of Facebook content"
586
- )
587
-
588
- facebook_url = st.text_input(
589
- "Facebook URL",
590
- placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
591
- help="Enter any Facebook URL for analysis"
592
- )
593
-
594
- # Processing mode
595
- st.subheader("πŸ”§ Processing Mode")
596
- processing_mode = st.radio(
597
- "Choose analysis mode:",
598
- ["AI Analysis (Recommended)", "Simple Analysis"],
599
- help="AI Analysis uses embeddings, Simple uses rule-based"
600
- )
601
-
602
- st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
603
-
604
- # Quick test URLs
605
- st.markdown("### πŸš€ Test URLs")
606
- test_urls = {
607
- "Gaming Group": "https://www.facebook.com/groups/gamersofbangladesh2",
608
- "Tech Community": "https://www.facebook.com/groups/programmingcommunity",
609
- "Business Page": "https://www.facebook.com/Meta/",
610
- }
611
-
612
- for name, url in test_urls.items():
613
- if st.button(f"πŸ”— {name}", key=f"fb_{name}"):
614
- st.session_state.current_fb_url = url
615
- st.rerun()
616
-
617
- if st.button("πŸš€ Extract Facebook Data", type="primary"):
618
- url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
619
-
620
- if not url_to_use:
621
- st.error("❌ Please enter a Facebook URL")
622
- elif 'facebook.com' not in url_to_use:
623
- st.error("❌ Please enter a valid Facebook URL")
624
  else:
625
  with st.spinner("πŸ”„ Analyzing Facebook data..."):
626
  extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
627
-
628
  if extracted_data.get("status") == "success":
629
  st.session_state.facebook_data = extracted_data
630
-
631
- # Process based on selected mode
632
- if st.session_state.processing_mode == "ai":
633
- result = process_facebook_data(extracted_data)
634
- if result and result[0] != "simple":
635
- st.session_state.vectorstore = result[0]
636
- st.session_state.chatbot = create_chatbot(result[0])
637
- st.session_state.chat_history = []
638
- st.success("βœ… AI analysis ready!")
639
  else:
640
- st.warning("⚠️ Using simple analysis (AI features limited)")
641
  st.session_state.chatbot = "simple"
642
- st.session_state.chat_history = []
643
  else:
644
  st.session_state.chatbot = "simple"
645
- st.session_state.chat_history = []
646
- st.success("βœ… Simple analysis ready!")
647
-
648
- source = extracted_data.get('source', 'unknown')
649
- if source == 'demo':
650
- st.warning("πŸ“ Using realistic demo data (Facebook restrictions active)")
651
- else:
652
- st.success("βœ… Real data extracted successfully!")
653
  else:
654
- error_msg = extracted_data.get("error", "Unknown error")
655
- st.error(f"❌ Extraction failed: {error_msg}")
656
-
657
- if st.session_state.facebook_data:
658
- st.markdown("---")
659
- if st.button("πŸ—‘οΈ Clear Data", type="secondary"):
660
- st.session_state.facebook_data = None
661
- st.session_state.vectorstore = None
662
- st.session_state.chatbot = None
663
- st.session_state.chat_history = []
664
- st.rerun()
665
-
666
- # Main content
667
- col1, col2 = st.columns([1, 1])
668
-
669
  with col1:
670
  st.header("πŸ“Š Extraction Results")
671
-
672
  if st.session_state.facebook_data:
673
  data = st.session_state.facebook_data
674
- page_info = data['page_info']
675
- content_blocks = data['content_blocks']
676
- source = data.get('source', 'unknown')
677
-
678
- if source == 'demo':
679
- st.warning("πŸ“ **Demo Data** - Realistic simulation (Facebook restrictions)")
680
- else:
681
- st.success("βœ… **Real Data** - Successfully extracted")
682
-
683
- # Show processing mode
684
- if st.session_state.processing_mode == "simple":
685
- st.info("πŸ”§ **Simple Analysis Mode** - Rule-based processing")
686
- else:
687
- st.info("πŸ€– **AI Analysis Mode** - Embedding-based processing")
688
-
689
- # Metrics
690
- col1, col2, col3 = st.columns(3)
691
- with col1:
692
- st.metric("Content Blocks", len(content_blocks))
693
- with col2:
694
- st.metric("Data Source", source.upper())
695
- with col3:
696
- st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
697
-
698
- # Page info
699
- st.subheader("🏷️ Page Information")
700
  st.write(f"**Title:** {page_info['title']}")
701
- st.write(f"**URL Type:** {data['url_type']}")
702
- st.write(f"**Description:** {page_info.get('description', 'No description')}")
703
-
704
- if page_info.get('member_count'):
705
- st.write(f"**Members:** {page_info['member_count']}")
706
- elif page_info.get('follower_count'):
707
- st.write(f"**Followers:** {page_info['follower_count']}")
708
-
709
- st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
710
-
711
- # Content samples
712
- st.subheader("πŸ“ Content Analysis")
713
- for i, block in enumerate(content_blocks):
714
- with st.expander(f"Content {i+1} - {block['content_type']} ({block['word_count']} words)"):
715
- st.write(block['content'])
716
- st.caption(f"Public: {block['is_public_content']}")
717
-
718
- else:
719
- st.info("""
720
- ## πŸ“˜ Facebook Data Extractor
721
-
722
- **University Project Feature**
723
-
724
- **How it works:**
725
- 1. Enter any Facebook URL
726
- 2. System tries real data extraction
727
- 3. If blocked, uses **realistic demo data**
728
- 4. Choose between AI or Simple analysis
729
-
730
- **Analysis Modes:**
731
- - πŸ€– **AI Analysis**: Uses embeddings and Mistral AI
732
- - πŸ”§ **Simple Analysis**: Rule-based (works without embeddings)
733
-
734
- **Perfect for demonstrating:**
735
- - Social media data extraction concepts
736
- - AI analysis capabilities
737
- - Platform integration
738
- - Error handling strategies
739
- """)
740
 
741
  with col2:
742
- st.header("πŸ’¬ Analysis Chat")
743
-
744
- if st.session_state.chatbot and st.session_state.facebook_data:
745
- # Display chat history
746
- for chat in st.session_state.chat_history:
747
- if chat["role"] == "user":
748
- with st.chat_message("user"):
749
- st.write(chat['content'])
750
- elif chat["role"] == "assistant":
751
- with st.chat_message("assistant"):
752
- st.write(chat['content'])
753
-
754
- # Chat input
755
- user_input = st.chat_input("Ask about the Facebook data...")
756
-
757
  if user_input:
758
- st.session_state.chat_history.append({"role": "user", "content": user_input})
759
-
760
- with st.spinner("πŸ€” Analyzing..."):
761
- try:
762
- if st.session_state.chatbot == "simple":
763
- # Use simple analysis
764
- response = simple_chat_analysis(user_input, st.session_state.facebook_data)
765
- st.session_state.chat_history.append({"role": "assistant", "content": response})
766
- else:
767
- # Use AI chatbot
768
- response = st.session_state.chatbot.invoke({"question": user_input})
769
- answer = response.get("answer", "I couldn't generate a response.")
770
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
771
- st.rerun()
772
- except Exception as e:
773
- error_msg = f"Analysis Error: {str(e)}"
774
- st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
775
- st.rerun()
776
-
777
- # Suggested questions
778
- if not st.session_state.chat_history:
779
- st.subheader("πŸ’‘ Try asking:")
780
- suggestions = [
781
- "What is this Facebook group/page about?",
782
- "Summarize the main content and purpose",
783
- "What kind of community is this?",
784
- "Analyze the engagement and activity level"
785
- ]
786
-
787
- for suggestion in suggestions:
788
- if st.button(suggestion, key=f"fb_suggest_{suggestion}"):
789
- st.info(f"Type: '{suggestion}' in chat")
790
-
791
- elif st.session_state.facebook_data:
792
- st.info("πŸ’¬ Start chatting about the Facebook data")
793
- else:
794
- st.info("πŸ” Extract Facebook data to enable analysis")
795
 
796
- if __name__ == "__main__":
797
- main()
 
9
  import os
10
  import tempfile
11
 
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
 
14
  from langchain.vectorstores import FAISS
15
  from langchain.memory import ConversationBufferMemory
16
  from langchain.chains import ConversationalRetrievalChain
17
  from langchain.schema import Document
18
+ from langchain.chat_models import ChatHuggingFaceHub
19
 
20
  st.set_page_config(
21
  page_title="Facebook Data Extractor",
 
34
  try:
35
  st.info(f"πŸ” Analyzing: {url}")
36
 
 
37
  real_data = self._try_real_extraction(url)
38
  if real_data.get("status") == "success":
39
  return real_data
40
 
 
41
  st.warning("⚠️ Using demo data (Facebook restrictions active)")
42
  return self._get_demo_data(url, data_type)
43
 
 
46
  return self._get_demo_data(url, data_type)
47
 
48
  def _try_real_extraction(self, url: str) -> Dict:
 
49
  try:
 
50
  headers = {
51
+ 'User-Agent': 'Mozilla/5.0',
 
 
 
 
 
 
52
  }
 
 
53
  response = requests.get(url, headers=headers, timeout=10, verify=False)
 
54
  if response.status_code == 200:
55
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
56
  title = soup.find('title')
57
  description = soup.find('meta', attrs={'name': 'description'})
 
58
  return {
59
  "page_info": {
60
  "title": title.text if title else "Facebook Content",
 
71
  }
72
  else:
73
  return {"status": "error", "source": "real"}
 
74
  except Exception:
75
  return {"status": "error", "source": "real"}
76
 
77
  def _extract_real_content(self, soup) -> List[Dict]:
 
78
  blocks = []
79
  text = soup.get_text()
80
  paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
 
81
  for i, paragraph in enumerate(paragraphs[:8]):
82
  blocks.append({
83
  "id": i + 1,
 
87
  "content_type": "real_content",
88
  "is_public_content": True
89
  })
 
90
  return blocks
91
 
92
  def _get_demo_data(self, url: str, data_type: str) -> Dict:
 
93
  url_type = self._analyze_url_type(url)
 
94
  if 'group' in url_type.lower():
95
  return self._get_group_demo_data(url, data_type)
96
  elif 'page' in url_type.lower():
 
99
  return self._get_general_demo_data(url, data_type)
100
 
101
  def _analyze_url_type(self, url: str) -> str:
 
102
  url_lower = url.lower()
 
103
  if 'group' in url_lower:
104
  return "Facebook Group"
105
  elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
 
112
  return "Facebook Content"
113
 
114
  def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
 
115
  group_name = self._extract_name_from_url(url) or "Gaming Community"
 
116
  return {
117
  "page_info": {
118
  "title": f"{group_name} | Facebook Group",
 
124
  "access_note": "Public group - Limited data due to platform restrictions"
125
  },
126
  "content_blocks": [
127
+ {"id": 1, "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.", "length": 120, "word_count": 25, "content_type": "welcome_message", "is_public_content": True},
128
+ {"id": 2, "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.", "length": 95, "word_count": 18, "content_type": "member_post", "is_public_content": True},
129
+ {"id": 3, "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.", "length": 88, "word_count": 16, "content_type": "question_post", "is_public_content": True},
130
+ {"id": 4, "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.", "length": 102, "word_count": 19, "content_type": "event_announcement", "is_public_content": True},
131
+ {"id": 5, "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.", "length": 78, "word_count": 14, "content_type": "community_guidelines", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  ],
133
  "url_type": "Facebook Group",
134
  "extraction_time": datetime.now().isoformat(),
 
138
  }
139
 
140
  def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
 
141
  page_name = self._extract_name_from_url(url) or "Brand Page"
 
142
  return {
143
  "page_info": {
144
  "title": f"{page_name} | Facebook Page",
 
150
  "access_note": "Public page - Limited data due to platform restrictions"
151
  },
152
  "content_blocks": [
153
+ {"id": 1, "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.", "length": 98, "word_count": 15, "content_type": "welcome_message", "is_public_content": True},
154
+ {"id": 2, "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.", "length": 92, "word_count": 16, "content_type": "announcement", "is_public_content": True},
155
+ {"id": 3, "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.", "length": 87, "word_count": 14, "content_type": "event_followup", "is_public_content": True},
156
+ {"id": 4, "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.", "length": 85, "word_count": 15, "content_type": "support_info", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  ],
158
  "url_type": "Facebook Page",
159
  "extraction_time": datetime.now().isoformat(),
 
163
  }
164
 
165
  def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
 
166
  return {
167
  "page_info": {
168
  "title": "Facebook Content",
 
173
  "access_note": "Public content - Platform restrictions apply"
174
  },
175
  "content_blocks": [
176
+ {"id": 1, "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.", "length": 105, "word_count": 16, "content_type": "general_content", "is_public_content": True},
177
+ {"id": 2, "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.", "length": 82, "word_count": 12, "content_type": "platform_updates", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  ],
179
  "url_type": "Facebook Content",
180
  "extraction_time": datetime.now().isoformat(),
 
184
  }
185
 
186
  def _extract_name_from_url(self, url: str) -> str:
 
 
187
  match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
188
  if match:
189
  name = match.group(1)
 
190
  name = name.replace('-', ' ').title()
191
  return name
192
  return ""
193
+
194
  def _create_demo_data(self) -> Dict:
 
195
  return {
196
  "groups": {
197
  "gamersofbangladesh2": "Gaming Community Bangladesh",
 
205
  }
206
  }
207
 
208
# ------------------ Hugging Face AI Integration ------------------
210
def get_embeddings():
    """Load the sentence-embedding model used to build the FAISS index.

    Returns:
        A ``HuggingFaceEmbeddings`` instance, or ``None`` when no
        ``HUGGINGFACEHUB_API_TOKEN`` is configured or loading fails.
    """
    api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
    if not api_key:
        st.error("❌ HuggingFace API Key not found")
        return None

    try:
        # Bug fix: the previous code referenced HuggingFaceInstructEmbeddings
        # with a huggingfacehub_api_token kwarg — that class is not imported
        # in this file and HuggingFaceEmbeddings takes no token (it runs the
        # sentence-transformers model locally).
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
        st.success("βœ… HuggingFace Embeddings loaded")
        return embeddings
    except Exception as e:
        st.error(f"❌ Failed to load embeddings: {e}")
        return None
223
+
224
def get_llm():
    """Load the hosted LLM used by the conversational retrieval chain.

    Returns:
        A ``HuggingFaceHub`` LLM instance, or ``None`` when no
        ``HUGGINGFACEHUB_API_TOKEN`` is configured or loading fails.
    """
    api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
    if not api_key:
        st.error("❌ HuggingFace API Key not found")
        return None

    try:
        # Bug fix: ChatHuggingFaceHub is not defined anywhere; the class
        # imported at the top of this file is HuggingFaceHub.
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-large",
            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
            huggingfacehub_api_token=api_key,
        )
        st.success("βœ… HuggingFace LLM loaded")
        return llm
    except Exception as e:
        st.error(f"❌ Failed to load LLM: {e}")
        return None
237
+
238
def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
    """Keyword-based fallback analysis when no AI chatbot is available.

    Matches a few keywords in the question and answers from the extracted
    payload. Any internal error is reported as a string rather than raised.
    """
    try:
        if not extracted_data:
            return "No data available."

        info = extracted_data.get('page_info', {})
        blocks = extracted_data.get('content_blocks', [])
        kind = extracted_data.get('url_type', 'Facebook Content')
        origin = extracted_data.get('source', 'demo')
        question = user_input.lower()

        def mentions(*terms: str) -> bool:
            # Substring match, so phrases like "what is" work too.
            return any(term in question for term in terms)

        if mentions('summary', 'summarize', 'overview'):
            return (
                f"**πŸ“Š Summary of {info.get('title', 'Facebook Content')}**\n"
                f"Type: {kind}\n"
                f"Data Source: {origin.upper()}\n"
                f"Blocks: {len(blocks)}"
            )
        if mentions('purpose', 'about', 'what is'):
            return f"**🎯 Purpose:** {info.get('description', 'Community engagement and content sharing')}"
        return f"**πŸ€– Analysis:** This {kind.lower()} contains {len(blocks)} content blocks."
    except Exception as e:
        return f"Analysis error: {str(e)}"
257
 
258
def process_facebook_data(extracted_data):
    """Split extracted content into chunks and index them for retrieval.

    Returns:
        ``(vectorstore, documents)``; ``(None, [])`` on missing/failed data,
        and the sentinel string ``"simple"`` as the first element when
        embeddings could not be loaded.
    """
    if not extracted_data or extracted_data.get("status") != "success":
        return None, []

    # Concatenate every block's text, each followed by a blank line
    # (same string the original += loop produced).
    combined = "".join(
        block["content"] + "\n\n" for block in extracted_data["content_blocks"]
    )

    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    pieces = splitter.split_text(combined)
    docs = [Document(page_content=piece) for piece in pieces]

    embedder = get_embeddings()
    if embedder is None:
        # No embeddings available — caller falls back to keyword analysis.
        return "simple", docs

    index = FAISS.from_documents(docs, embedder)
    return index, docs
276
 
277
def create_chatbot(vectorstore):
    """Wire the retriever, memory and LLM into a conversational chain.

    Returns:
        A ``ConversationalRetrievalChain``, or the sentinel string
        ``"simple"`` when no LLM could be loaded.
    """
    language_model = get_llm()
    if language_model is None:
        return "simple"

    conversation_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
    return ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        memory=conversation_memory,
        return_source_documents=True,
        output_key="answer",
    )
291
+
292
# ------------------ Streamlit UI ------------------
 
 
 
 
 
 
 
 
293
 
294
def main():
    """Streamlit entry point: extraction controls, results view, and Q&A chat."""
    st.title("πŸ“˜ Facebook Data Extractor (Live Hugging Face)")
    st.markdown("**University Project** - Real data when possible, demo data if restricted")

    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")

    # Session-state defaults (idempotent across Streamlit reruns).
    # Bug fix: "vectorstore" and "chatbot" are read further down but their
    # initialization lines were missing — initialize them explicitly.
    if "extractor" not in st.session_state:
        st.session_state.extractor = FacebookDataSimulator()
    if "facebook_data" not in st.session_state:
        st.session_state.facebook_data = None
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None
    if "chatbot" not in st.session_state:
        st.session_state.chatbot = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing_mode" not in st.session_state:
        st.session_state.processing_mode = "ai"

    # Sidebar: configuration + extraction trigger.
    with st.sidebar:
        st.header("βš™οΈ Facebook Configuration")
        data_type = st.selectbox("Content Type", ["group", "page", "event", "post", "general"])
        facebook_url = st.text_input("Facebook URL", "https://www.facebook.com/groups/gamersofbangladesh2")
        processing_mode = st.radio("Analysis Mode:", ["AI Analysis (Recommended)", "Simple Analysis"])
        st.session_state.processing_mode = (
            "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
        )

        if st.button("πŸš€ Extract Facebook Data"):
            url_to_use = facebook_url
            if not url_to_use or 'facebook.com' not in url_to_use:
                st.error("❌ Enter a valid Facebook URL")
            else:
                with st.spinner("πŸ”„ Analyzing Facebook data..."):
                    extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
                if extracted_data.get("status") == "success":
                    st.session_state.facebook_data = extracted_data
                    if st.session_state.processing_mode == "ai":
                        vectorstore, _ = process_facebook_data(extracted_data)
                        if vectorstore != "simple":
                            st.session_state.vectorstore = vectorstore
                            st.session_state.chatbot = create_chatbot(vectorstore)
                        else:
                            st.warning("⚠️ Using simple analysis")
                            st.session_state.chatbot = "simple"
                    else:
                        st.session_state.chatbot = "simple"
                    st.success("βœ… Data ready!")
                else:
                    st.error("❌ Extraction failed")

    # Main area: extraction results (left) and chat (right).
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("πŸ“Š Extraction Results")
        if st.session_state.facebook_data:
            data = st.session_state.facebook_data
            page_info = data["page_info"]
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**Description:** {page_info.get('description', 'No description')}")
            st.write(f"**Access:** {page_info.get('access_note', 'Public')}")
            st.subheader("Content Blocks")
            for i, block in enumerate(data["content_blocks"]):
                st.markdown(f"**Block {i+1}:** {block['content']}")

    with col2:
        st.header("πŸ’¬ Ask About This Data")
        if st.session_state.facebook_data:
            user_input = st.text_input("Enter your question")
            if user_input:
                # Bug fix: guard against a None chatbot (data present but no
                # chain built) — fall back to the keyword analysis instead of
                # calling None as a chain.
                if st.session_state.chatbot in (None, "simple"):
                    st.markdown(simple_chat_analysis(user_input, st.session_state.facebook_data))
                else:
                    chain = st.session_state.chatbot
                    result = chain({"question": user_input})
                    st.markdown(result['answer'])
                    if result.get("source_documents"):
                        st.subheader("πŸ“‘ Source Documents")
                        for doc in result["source_documents"]:
                            st.markdown(f"- {doc.page_content[:300]}...")
 
376
+ if __name__=="__main__":
377
+ main()