Refat81 committed on
Commit
47ac751
·
verified ·
1 Parent(s): 67a4166

Update pages/facebook_extractor.py

Browse files
Files changed (1) hide show
  1. pages/facebook_extractor.py +210 -58
pages/facebook_extractor.py CHANGED
@@ -7,6 +7,7 @@ import re
7
  from datetime import datetime
8
  from typing import List, Dict
9
  import os
 
10
 
11
  # Import your existing AI components
12
  from langchain_text_splitters import CharacterTextSplitter
@@ -315,16 +316,53 @@ class FacebookDataSimulator:
315
  }
316
  }
317
 
318
- # AI Functions (same as your LinkedIn analyzer)
319
  def get_embeddings():
320
- """Initialize embeddings"""
321
  try:
322
- embeddings = HuggingFaceEmbeddings(
323
- model_name="sentence-transformers/all-MiniLM-L6-v2"
324
- )
325
- return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  except Exception as e:
327
- st.error(f"Embeddings error: {e}")
328
  return None
329
 
330
  def get_llm():
@@ -335,22 +373,115 @@ def get_llm():
335
  st.error("HuggingFace API Key not found")
336
  return None
337
 
338
- llm = HuggingFaceHub(
339
- repo_id="mistralai/Mistral-7B-Instruct-v0.1",
340
- huggingfacehub_api_token=api_key,
341
- model_kwargs={
342
- "temperature": 0.7,
343
- "max_length": 512,
344
- "max_new_tokens": 256,
345
- }
346
- )
347
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  except Exception as e:
349
- st.error(f"LLM error: {e}")
350
  return None
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  def process_facebook_data(extracted_data):
353
- """Process extracted data for AI analysis"""
354
  if not extracted_data or extracted_data.get("status") != "success":
355
  return None, []
356
 
@@ -396,23 +527,14 @@ def process_facebook_data(extracted_data):
396
  chunks = splitter.split_text(all_text)
397
  documents = [Document(page_content=chunk) for chunk in chunks]
398
 
399
- # Create vector store
400
- try:
401
- embeddings = get_embeddings()
402
- if embeddings is None:
403
- return None, []
404
- vectorstore = FAISS.from_documents(documents, embeddings)
405
- return vectorstore, chunks
406
- except Exception as e:
407
- st.error(f"Vector store failed: {e}")
408
- return None, []
409
 
410
  def create_chatbot(vectorstore):
411
  """Create conversational chatbot"""
412
  try:
413
  llm = get_llm()
414
  if llm is None:
415
- return None
416
 
417
  memory = ConversationBufferMemory(
418
  memory_key="chat_history",
@@ -430,7 +552,7 @@ def create_chatbot(vectorstore):
430
  return chain
431
  except Exception as e:
432
  st.error(f"Chatbot creation failed: {str(e)}")
433
- return None
434
 
435
  def main():
436
  st.title("πŸ“˜ Facebook Data Extractor")
@@ -450,6 +572,8 @@ def main():
450
  st.session_state.chatbot = None
451
  if "chat_history" not in st.session_state:
452
  st.session_state.chat_history = []
 
 
453
 
454
  # Sidebar
455
  with st.sidebar:
@@ -467,6 +591,16 @@ def main():
467
  help="Enter any Facebook URL for analysis"
468
  )
469
 
 
 
 
 
 
 
 
 
 
 
470
  # Quick test URLs
471
  st.markdown("### πŸš€ Test URLs")
472
  test_urls = {
@@ -494,20 +628,28 @@ def main():
494
  if extracted_data.get("status") == "success":
495
  st.session_state.facebook_data = extracted_data
496
 
497
- # Process for AI
498
- vectorstore, chunks = process_facebook_data(extracted_data)
499
- if vectorstore:
500
- st.session_state.vectorstore = vectorstore
501
- st.session_state.chatbot = create_chatbot(vectorstore)
502
- st.session_state.chat_history = []
503
-
504
- source = extracted_data.get('source', 'unknown')
505
- if source == 'demo':
506
- st.warning("πŸ“ Using realistic demo data (Facebook restrictions active)")
507
  else:
508
- st.success("βœ… Real data extracted successfully!")
 
 
 
 
 
 
 
 
 
 
509
  else:
510
- st.error("❌ Failed to process data for AI")
511
  else:
512
  error_msg = extracted_data.get("error", "Unknown error")
513
  st.error(f"❌ Extraction failed: {error_msg}")
@@ -538,6 +680,12 @@ def main():
538
  else:
539
  st.success("βœ… **Real Data** - Successfully extracted")
540
 
 
 
 
 
 
 
541
  # Metrics
542
  col1, col2, col3 = st.columns(3)
543
  with col1:
@@ -545,7 +693,7 @@ def main():
545
  with col2:
546
  st.metric("Data Source", source.upper())
547
  with col3:
548
- st.metric("Status", "Success")
549
 
550
  # Page info
551
  st.subheader("🏷️ Page Information")
@@ -577,13 +725,11 @@ def main():
577
  1. Enter any Facebook URL
578
  2. System tries real data extraction
579
  3. If blocked, uses **realistic demo data**
580
- 4. Full AI analysis available
581
 
582
- **Features:**
583
- - Real data extraction when possible
584
- - Realistic demo data when restricted
585
- - Full AI-powered analysis
586
- - Professional interface
587
 
588
  **Perfect for demonstrating:**
589
  - Social media data extraction concepts
@@ -593,7 +739,7 @@ def main():
593
  """)
594
 
595
  with col2:
596
- st.header("πŸ’¬ AI Analysis Chat")
597
 
598
  if st.session_state.chatbot and st.session_state.facebook_data:
599
  # Display chat history
@@ -611,14 +757,20 @@ def main():
611
  if user_input:
612
  st.session_state.chat_history.append({"role": "user", "content": user_input})
613
 
614
- with st.spinner("πŸ€” AI is analyzing..."):
615
  try:
616
- response = st.session_state.chatbot.invoke({"question": user_input})
617
- answer = response.get("answer", "I couldn't generate a response.")
618
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
 
 
 
 
 
 
619
  st.rerun()
620
  except Exception as e:
621
- error_msg = f"AI Error: {str(e)}"
622
  st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
623
  st.rerun()
624
 
@@ -637,9 +789,9 @@ def main():
637
  st.info(f"Type: '{suggestion}' in chat")
638
 
639
  elif st.session_state.facebook_data:
640
- st.info("πŸ’¬ Start chatting with AI about the Facebook data")
641
  else:
642
- st.info("πŸ” Extract Facebook data to enable AI chat")
643
 
644
  if __name__ == "__main__":
645
  main()
 
7
  from datetime import datetime
8
  from typing import List, Dict
9
  import os
10
+ import tempfile
11
 
12
  # Import your existing AI components
13
  from langchain_text_splitters import CharacterTextSplitter
 
316
  }
317
  }
318
 
 
319
  def get_embeddings():
320
+ """Initialize embeddings with better error handling and cache management"""
321
  try:
322
+ # Try multiple embedding models with different cache directories
323
+ model_options = [
324
+ "sentence-transformers/all-MiniLM-L6-v2",
325
+ "sentence-transformers/paraphrase-MiniLM-L3-v2",
326
+ "sentence-transformers/all-mpnet-base-v2"
327
+ ]
328
+
329
+ for model_name in model_options:
330
+ try:
331
+ st.info(f"πŸ”„ Trying embedding model: {model_name}")
332
+
333
+ # Use temporary directory for cache to avoid permission issues
334
+ with tempfile.TemporaryDirectory() as temp_cache:
335
+ embeddings = HuggingFaceEmbeddings(
336
+ model_name=model_name,
337
+ cache_folder=temp_cache,
338
+ model_kwargs={'device': 'cpu'}
339
+ )
340
+
341
+ # Test the embeddings
342
+ test_text = "Hello world"
343
+ test_embedding = embeddings.embed_query(test_text)
344
+ if test_embedding and len(test_embedding) > 0:
345
+ st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
346
+ return embeddings
347
+
348
+ except Exception as e:
349
+ st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")
350
+ continue
351
+
352
+ # If all models fail, try without cache
353
+ st.warning("πŸ”„ Trying fallback embedding method...")
354
+ try:
355
+ embeddings = HuggingFaceEmbeddings(
356
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
357
+ )
358
+ st.success("βœ… Loaded fallback embeddings")
359
+ return embeddings
360
+ except Exception as e:
361
+ st.error(f"❌ All embedding models failed: {e}")
362
+ return None
363
+
364
  except Exception as e:
365
+ st.error(f"❌ Embeddings error: {e}")
366
  return None
367
 
368
  def get_llm():
 
373
  st.error("HuggingFace API Key not found")
374
  return None
375
 
376
+ # Try multiple models
377
+ model_options = [
378
+ "mistralai/Mistral-7B-Instruct-v0.1",
379
+ "google/flan-t5-large",
380
+ "microsoft/DialoGPT-large"
381
+ ]
382
+
383
+ for model_id in model_options:
384
+ try:
385
+ st.info(f"πŸ”„ Trying LLM: {model_id}")
386
+
387
+ llm = HuggingFaceHub(
388
+ repo_id=model_id,
389
+ huggingfacehub_api_token=api_key,
390
+ model_kwargs={
391
+ "temperature": 0.7,
392
+ "max_length": 512,
393
+ "max_new_tokens": 256,
394
+ }
395
+ )
396
+
397
+ # Test the model
398
+ test_response = llm.invoke("Hello")
399
+ if test_response and len(test_response.strip()) > 0:
400
+ st.success(f"βœ… Loaded LLM: {model_id.split('/')[-1]}")
401
+ return llm
402
+
403
+ except Exception as e:
404
+ st.warning(f"⚠️ Failed to load {model_id}: {str(e)}")
405
+ continue
406
+
407
+ st.error("❌ All LLMs failed to load")
408
+ return None
409
+
410
  except Exception as e:
411
+ st.error(f"❌ LLM error: {e}")
412
  return None
413
 
414
+ def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
415
+ """Simple rule-based chat analysis when embeddings fail"""
416
+ try:
417
+ if not extracted_data:
418
+ return "No data available for analysis."
419
+
420
+ page_info = extracted_data.get('page_info', {})
421
+ content_blocks = extracted_data.get('content_blocks', [])
422
+ url_type = extracted_data.get('url_type', 'Facebook Content')
423
+ source = extracted_data.get('source', 'demo')
424
+
425
+ user_input_lower = user_input.lower()
426
+
427
+ # Basic analysis based on input
428
+ if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
429
+ return f"""**πŸ“Š Summary of {page_info.get('title', 'Facebook Content')}**
430
+
431
+ **Type:** {url_type}
432
+ **Data Source:** {source.upper()}
433
+ **Description:** {page_info.get('description', 'No description available')}
434
+
435
+ This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.
436
+
437
+ **Key Content Types:**
438
+ {', '.join(set(block['content_type'] for block in content_blocks))}
439
+
440
+ The content focuses on community engagement and social interactions."""
441
+
442
+ elif any(word in user_input_lower for word in ['purpose', 'about', 'what is']):
443
+ return f"""**🎯 Purpose Analysis**
444
+
445
+ Based on the extracted data, this {url_type.lower()} appears to be focused on:
446
+
447
+ - **Community Building:** {len([b for b in content_blocks if 'community' in b['content_type'].lower()])} community-related posts
448
+ - **Information Sharing:** {len([b for b in content_blocks if 'announcement' in b['content_type'].lower()])} announcements
449
+ - **Member Engagement:** {len([b for b in content_blocks if 'post' in b['content_type'].lower()])} member posts
450
+
451
+ **Overall Purpose:** {page_info.get('description', 'Community engagement and content sharing')}"""
452
+
453
+ elif any(word in user_input_lower for word in ['activity', 'engagement', 'active']):
454
+ active_blocks = len([b for b in content_blocks if any(word in b['content_type'].lower() for word in ['post', 'question', 'event'])])
455
+ return f"""**πŸ“ˆ Activity Analysis**
456
+
457
+ **Content Activity Level:**
458
+ - Total Content Blocks: {len(content_blocks)}
459
+ - Active Engagement Posts: {active_blocks}
460
+ - Informational Posts: {len(content_blocks) - active_blocks}
461
+
462
+ The {url_type.lower()} shows a good mix of member engagement and informational content, suggesting an active community."""
463
+
464
+ else:
465
+ return f"""**πŸ€– Analysis Response**
466
+
467
+ I've analyzed the {url_type.lower()} data for you.
468
+
469
+ **Your question:** "{user_input}"
470
+ **Content Source:** {source.upper()} data
471
+ **Content Type:** {url_type}
472
+
473
+ This {url_type.lower()} contains {len(content_blocks)} pieces of content focusing on community engagement and information sharing.
474
+
475
+ **Try asking:**
476
+ - "What is the main purpose of this group/page?"
477
+ - "Summarize the content and activities"
478
+ - "What kind of engagement does this content show?""""
479
+
480
+ except Exception as e:
481
+ return f"Analysis error: {str(e)}"
482
+
483
  def process_facebook_data(extracted_data):
484
+ """Process extracted data for AI analysis with fallbacks"""
485
  if not extracted_data or extracted_data.get("status") != "success":
486
  return None, []
487
 
 
527
  chunks = splitter.split_text(all_text)
528
  documents = [Document(page_content=chunk) for chunk in chunks]
529
 
530
+ return "simple", documents # Return simple mode instead of vectorstore
 
 
 
 
 
 
 
 
 
531
 
532
  def create_chatbot(vectorstore):
533
  """Create conversational chatbot"""
534
  try:
535
  llm = get_llm()
536
  if llm is None:
537
+ return "simple" # Return simple mode if LLM fails
538
 
539
  memory = ConversationBufferMemory(
540
  memory_key="chat_history",
 
552
  return chain
553
  except Exception as e:
554
  st.error(f"Chatbot creation failed: {str(e)}")
555
+ return "simple" # Fallback to simple mode
556
 
557
  def main():
558
  st.title("πŸ“˜ Facebook Data Extractor")
 
572
  st.session_state.chatbot = None
573
  if "chat_history" not in st.session_state:
574
  st.session_state.chat_history = []
575
+ if "processing_mode" not in st.session_state:
576
+ st.session_state.processing_mode = "ai" # ai or simple
577
 
578
  # Sidebar
579
  with st.sidebar:
 
591
  help="Enter any Facebook URL for analysis"
592
  )
593
 
594
+ # Processing mode
595
+ st.subheader("πŸ”§ Processing Mode")
596
+ processing_mode = st.radio(
597
+ "Choose analysis mode:",
598
+ ["AI Analysis (Recommended)", "Simple Analysis"],
599
+ help="AI Analysis uses embeddings, Simple uses rule-based"
600
+ )
601
+
602
+ st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
603
+
604
  # Quick test URLs
605
  st.markdown("### πŸš€ Test URLs")
606
  test_urls = {
 
628
  if extracted_data.get("status") == "success":
629
  st.session_state.facebook_data = extracted_data
630
 
631
+ # Process based on selected mode
632
+ if st.session_state.processing_mode == "ai":
633
+ result = process_facebook_data(extracted_data)
634
+ if result and result[0] != "simple":
635
+ st.session_state.vectorstore = result[0]
636
+ st.session_state.chatbot = create_chatbot(result[0])
637
+ st.session_state.chat_history = []
638
+ st.success("βœ… AI analysis ready!")
 
 
639
  else:
640
+ st.warning("⚠️ Using simple analysis (AI features limited)")
641
+ st.session_state.chatbot = "simple"
642
+ st.session_state.chat_history = []
643
+ else:
644
+ st.session_state.chatbot = "simple"
645
+ st.session_state.chat_history = []
646
+ st.success("βœ… Simple analysis ready!")
647
+
648
+ source = extracted_data.get('source', 'unknown')
649
+ if source == 'demo':
650
+ st.warning("πŸ“ Using realistic demo data (Facebook restrictions active)")
651
  else:
652
+ st.success("βœ… Real data extracted successfully!")
653
  else:
654
  error_msg = extracted_data.get("error", "Unknown error")
655
  st.error(f"❌ Extraction failed: {error_msg}")
 
680
  else:
681
  st.success("βœ… **Real Data** - Successfully extracted")
682
 
683
+ # Show processing mode
684
+ if st.session_state.processing_mode == "simple":
685
+ st.info("πŸ”§ **Simple Analysis Mode** - Rule-based processing")
686
+ else:
687
+ st.info("πŸ€– **AI Analysis Mode** - Embedding-based processing")
688
+
689
  # Metrics
690
  col1, col2, col3 = st.columns(3)
691
  with col1:
 
693
  with col2:
694
  st.metric("Data Source", source.upper())
695
  with col3:
696
+ st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
697
 
698
  # Page info
699
  st.subheader("🏷️ Page Information")
 
725
  1. Enter any Facebook URL
726
  2. System tries real data extraction
727
  3. If blocked, uses **realistic demo data**
728
+ 4. Choose between AI or Simple analysis
729
 
730
+ **Analysis Modes:**
731
+ - πŸ€– **AI Analysis**: Uses embeddings and Mistral AI
732
+ - πŸ”§ **Simple Analysis**: Rule-based (works without embeddings)
 
 
733
 
734
  **Perfect for demonstrating:**
735
  - Social media data extraction concepts
 
739
  """)
740
 
741
  with col2:
742
+ st.header("πŸ’¬ Analysis Chat")
743
 
744
  if st.session_state.chatbot and st.session_state.facebook_data:
745
  # Display chat history
 
757
  if user_input:
758
  st.session_state.chat_history.append({"role": "user", "content": user_input})
759
 
760
+ with st.spinner("πŸ€” Analyzing..."):
761
  try:
762
+ if st.session_state.chatbot == "simple":
763
+ # Use simple analysis
764
+ response = simple_chat_analysis(user_input, st.session_state.facebook_data)
765
+ st.session_state.chat_history.append({"role": "assistant", "content": response})
766
+ else:
767
+ # Use AI chatbot
768
+ response = st.session_state.chatbot.invoke({"question": user_input})
769
+ answer = response.get("answer", "I couldn't generate a response.")
770
+ st.session_state.chat_history.append({"role": "assistant", "content": answer})
771
  st.rerun()
772
  except Exception as e:
773
+ error_msg = f"Analysis Error: {str(e)}"
774
  st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
775
  st.rerun()
776
 
 
789
  st.info(f"Type: '{suggestion}' in chat")
790
 
791
  elif st.session_state.facebook_data:
792
+ st.info("πŸ’¬ Start chatting about the Facebook data")
793
  else:
794
+ st.info("πŸ” Extract Facebook data to enable analysis")
795
 
796
  if __name__ == "__main__":
797
  main()