Refat81 committed on
Commit
fd2cc7f
·
verified ·
1 Parent(s): 47ac751

Update pages/facebook_extractor.py

Browse files
Files changed (1) hide show
  1. pages/facebook_extractor.py +121 -541
pages/facebook_extractor.py CHANGED
@@ -9,14 +9,13 @@ from typing import List, Dict
9
  import os
10
  import tempfile
11
 
12
- # Import your existing AI components
13
- from langchain_text_splitters import CharacterTextSplitter
14
- from langchain_community.embeddings import HuggingFaceEmbeddings
15
  from langchain.vectorstores import FAISS
16
  from langchain.memory import ConversationBufferMemory
17
  from langchain.chains import ConversationalRetrievalChain
18
  from langchain.schema import Document
19
- from langchain_community.llms import HuggingFaceHub
20
 
21
  st.set_page_config(
22
  page_title="Facebook Data Extractor",
@@ -35,12 +34,10 @@ class FacebookDataSimulator:
35
  try:
36
  st.info(f"πŸ” Analyzing: {url}")
37
 
38
- # Try real extraction first
39
  real_data = self._try_real_extraction(url)
40
  if real_data.get("status") == "success":
41
  return real_data
42
 
43
- # If real extraction fails, use demo data
44
  st.warning("⚠️ Using demo data (Facebook restrictions active)")
45
  return self._get_demo_data(url, data_type)
46
 
@@ -49,29 +46,15 @@ class FacebookDataSimulator:
49
  return self._get_demo_data(url, data_type)
50
 
51
  def _try_real_extraction(self, url: str) -> Dict:
52
- """Try real extraction with better error handling"""
53
  try:
54
- # Use a proxy-like approach with different user agents
55
  headers = {
56
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
57
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
58
- 'Accept-Language': 'en-US,en;q=0.5',
59
- 'Accept-Encoding': 'gzip, deflate, br',
60
- 'DNT': '1',
61
- 'Connection': 'keep-alive',
62
- 'Upgrade-Insecure-Requests': '1',
63
  }
64
-
65
- # Try with shorter timeout
66
  response = requests.get(url, headers=headers, timeout=10, verify=False)
67
-
68
  if response.status_code == 200:
69
  soup = BeautifulSoup(response.text, 'html.parser')
70
-
71
- # Extract basic info
72
  title = soup.find('title')
73
  description = soup.find('meta', attrs={'name': 'description'})
74
-
75
  return {
76
  "page_info": {
77
  "title": title.text if title else "Facebook Content",
@@ -88,16 +71,13 @@ class FacebookDataSimulator:
88
  }
89
  else:
90
  return {"status": "error", "source": "real"}
91
-
92
  except Exception:
93
  return {"status": "error", "source": "real"}
94
 
95
  def _extract_real_content(self, soup) -> List[Dict]:
96
- """Extract content from real page"""
97
  blocks = []
98
  text = soup.get_text()
99
  paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
100
-
101
  for i, paragraph in enumerate(paragraphs[:8]):
102
  blocks.append({
103
  "id": i + 1,
@@ -107,13 +87,10 @@ class FacebookDataSimulator:
107
  "content_type": "real_content",
108
  "is_public_content": True
109
  })
110
-
111
  return blocks
112
 
113
  def _get_demo_data(self, url: str, data_type: str) -> Dict:
114
- """Get realistic demo data based on URL type"""
115
  url_type = self._analyze_url_type(url)
116
-
117
  if 'group' in url_type.lower():
118
  return self._get_group_demo_data(url, data_type)
119
  elif 'page' in url_type.lower():
@@ -122,9 +99,7 @@ class FacebookDataSimulator:
122
  return self._get_general_demo_data(url, data_type)
123
 
124
  def _analyze_url_type(self, url: str) -> str:
125
- """Analyze URL type for realistic demo data"""
126
  url_lower = url.lower()
127
-
128
  if 'group' in url_lower:
129
  return "Facebook Group"
130
  elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
@@ -137,9 +112,7 @@ class FacebookDataSimulator:
137
  return "Facebook Content"
138
 
139
  def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
140
- """Get realistic group demo data"""
141
  group_name = self._extract_name_from_url(url) or "Gaming Community"
142
-
143
  return {
144
  "page_info": {
145
  "title": f"{group_name} | Facebook Group",
@@ -151,46 +124,11 @@ class FacebookDataSimulator:
151
  "access_note": "Public group - Limited data due to platform restrictions"
152
  },
153
  "content_blocks": [
154
- {
155
- "id": 1,
156
- "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
157
- "length": 120,
158
- "word_count": 25,
159
- "content_type": "welcome_message",
160
- "is_public_content": True
161
- },
162
- {
163
- "id": 2,
164
- "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
165
- "length": 95,
166
- "word_count": 18,
167
- "content_type": "member_post",
168
- "is_public_content": True
169
- },
170
- {
171
- "id": 3,
172
- "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
173
- "length": 88,
174
- "word_count": 16,
175
- "content_type": "question_post",
176
- "is_public_content": True
177
- },
178
- {
179
- "id": 4,
180
- "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
181
- "length": 102,
182
- "word_count": 19,
183
- "content_type": "event_announcement",
184
- "is_public_content": True
185
- },
186
- {
187
- "id": 5,
188
- "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
189
- "length": 78,
190
- "word_count": 14,
191
- "content_type": "community_guidelines",
192
- "is_public_content": True
193
- }
194
  ],
195
  "url_type": "Facebook Group",
196
  "extraction_time": datetime.now().isoformat(),
@@ -200,9 +138,7 @@ class FacebookDataSimulator:
200
  }
201
 
202
  def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
203
- """Get realistic page demo data"""
204
  page_name = self._extract_name_from_url(url) or "Brand Page"
205
-
206
  return {
207
  "page_info": {
208
  "title": f"{page_name} | Facebook Page",
@@ -214,38 +150,10 @@ class FacebookDataSimulator:
214
  "access_note": "Public page - Limited data due to platform restrictions"
215
  },
216
  "content_blocks": [
217
- {
218
- "id": 1,
219
- "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
220
- "length": 98,
221
- "word_count": 15,
222
- "content_type": "welcome_message",
223
- "is_public_content": True
224
- },
225
- {
226
- "id": 2,
227
- "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
228
- "length": 92,
229
- "word_count": 16,
230
- "content_type": "announcement",
231
- "is_public_content": True
232
- },
233
- {
234
- "id": 3,
235
- "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
236
- "length": 87,
237
- "word_count": 14,
238
- "content_type": "event_followup",
239
- "is_public_content": True
240
- },
241
- {
242
- "id": 4,
243
- "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
244
- "length": 85,
245
- "word_count": 15,
246
- "content_type": "support_info",
247
- "is_public_content": True
248
- }
249
  ],
250
  "url_type": "Facebook Page",
251
  "extraction_time": datetime.now().isoformat(),
@@ -255,7 +163,6 @@ class FacebookDataSimulator:
255
  }
256
 
257
  def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
258
- """Get general demo data"""
259
  return {
260
  "page_info": {
261
  "title": "Facebook Content",
@@ -266,22 +173,8 @@ class FacebookDataSimulator:
266
  "access_note": "Public content - Platform restrictions apply"
267
  },
268
  "content_blocks": [
269
- {
270
- "id": 1,
271
- "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
272
- "length": 105,
273
- "word_count": 16,
274
- "content_type": "general_content",
275
- "is_public_content": True
276
- },
277
- {
278
- "id": 2,
279
- "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
280
- "length": 82,
281
- "word_count": 12,
282
- "content_type": "platform_updates",
283
- "is_public_content": True
284
- }
285
  ],
286
  "url_type": "Facebook Content",
287
  "extraction_time": datetime.now().isoformat(),
@@ -291,18 +184,14 @@ class FacebookDataSimulator:
291
  }
292
 
293
  def _extract_name_from_url(self, url: str) -> str:
294
- """Extract name from URL for realistic demo data"""
295
- # Extract name from URL for more realistic demo data
296
  match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
297
  if match:
298
  name = match.group(1)
299
- # Clean up the name
300
  name = name.replace('-', ' ').title()
301
  return name
302
  return ""
303
-
304
  def _create_demo_data(self) -> Dict:
305
- """Create comprehensive demo data"""
306
  return {
307
  "groups": {
308
  "gamersofbangladesh2": "Gaming Community Bangladesh",
@@ -316,252 +205,99 @@ class FacebookDataSimulator:
316
  }
317
  }
318
 
 
 
319
  def get_embeddings():
320
- """Initialize embeddings with better error handling and cache management"""
321
- try:
322
- # Try multiple embedding models with different cache directories
323
- model_options = [
324
- "sentence-transformers/all-MiniLM-L6-v2",
325
- "sentence-transformers/paraphrase-MiniLM-L3-v2",
326
- "sentence-transformers/all-mpnet-base-v2"
327
- ]
328
-
329
- for model_name in model_options:
330
- try:
331
- st.info(f"πŸ”„ Trying embedding model: {model_name}")
332
-
333
- # Use temporary directory for cache to avoid permission issues
334
- with tempfile.TemporaryDirectory() as temp_cache:
335
- embeddings = HuggingFaceEmbeddings(
336
- model_name=model_name,
337
- cache_folder=temp_cache,
338
- model_kwargs={'device': 'cpu'}
339
- )
340
-
341
- # Test the embeddings
342
- test_text = "Hello world"
343
- test_embedding = embeddings.embed_query(test_text)
344
- if test_embedding and len(test_embedding) > 0:
345
- st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
346
- return embeddings
347
-
348
- except Exception as e:
349
- st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")
350
- continue
351
-
352
- # If all models fail, try without cache
353
- st.warning("πŸ”„ Trying fallback embedding method...")
354
- try:
355
- embeddings = HuggingFaceEmbeddings(
356
- model_name="sentence-transformers/all-MiniLM-L6-v2"
357
- )
358
- st.success("βœ… Loaded fallback embeddings")
359
- return embeddings
360
- except Exception as e:
361
- st.error(f"❌ All embedding models failed: {e}")
362
- return None
363
-
364
- except Exception as e:
365
- st.error(f"❌ Embeddings error: {e}")
366
  return None
367
 
 
 
 
 
 
 
 
 
368
  def get_llm():
369
- """Initialize HuggingFace LLM"""
370
- try:
371
- api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
372
- if not api_key:
373
- st.error("HuggingFace API Key not found")
374
- return None
375
-
376
- # Try multiple models
377
- model_options = [
378
- "mistralai/Mistral-7B-Instruct-v0.1",
379
- "google/flan-t5-large",
380
- "microsoft/DialoGPT-large"
381
- ]
382
-
383
- for model_id in model_options:
384
- try:
385
- st.info(f"πŸ”„ Trying LLM: {model_id}")
386
-
387
- llm = HuggingFaceHub(
388
- repo_id=model_id,
389
- huggingfacehub_api_token=api_key,
390
- model_kwargs={
391
- "temperature": 0.7,
392
- "max_length": 512,
393
- "max_new_tokens": 256,
394
- }
395
- )
396
-
397
- # Test the model
398
- test_response = llm.invoke("Hello")
399
- if test_response and len(test_response.strip()) > 0:
400
- st.success(f"βœ… Loaded LLM: {model_id.split('/')[-1]}")
401
- return llm
402
-
403
- except Exception as e:
404
- st.warning(f"⚠️ Failed to load {model_id}: {str(e)}")
405
- continue
406
-
407
- st.error("❌ All LLMs failed to load")
408
- return None
409
-
410
- except Exception as e:
411
- st.error(f"❌ LLM error: {e}")
412
  return None
413
 
 
 
 
 
 
 
 
 
414
  def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
415
- """Simple rule-based chat analysis when embeddings fail"""
416
  try:
417
  if not extracted_data:
418
- return "No data available for analysis."
419
 
420
  page_info = extracted_data.get('page_info', {})
421
  content_blocks = extracted_data.get('content_blocks', [])
422
  url_type = extracted_data.get('url_type', 'Facebook Content')
423
  source = extracted_data.get('source', 'demo')
424
-
425
  user_input_lower = user_input.lower()
426
-
427
- # Basic analysis based on input
428
- if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
429
- return f"""**πŸ“Š Summary of {page_info.get('title', 'Facebook Content')}**
430
-
431
- **Type:** {url_type}
432
- **Data Source:** {source.upper()}
433
- **Description:** {page_info.get('description', 'No description available')}
434
-
435
- This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.
436
-
437
- **Key Content Types:**
438
- {', '.join(set(block['content_type'] for block in content_blocks))}
439
-
440
- The content focuses on community engagement and social interactions."""
441
-
442
- elif any(word in user_input_lower for word in ['purpose', 'about', 'what is']):
443
- return f"""**🎯 Purpose Analysis**
444
-
445
- Based on the extracted data, this {url_type.lower()} appears to be focused on:
446
-
447
- - **Community Building:** {len([b for b in content_blocks if 'community' in b['content_type'].lower()])} community-related posts
448
- - **Information Sharing:** {len([b for b in content_blocks if 'announcement' in b['content_type'].lower()])} announcements
449
- - **Member Engagement:** {len([b for b in content_blocks if 'post' in b['content_type'].lower()])} member posts
450
-
451
- **Overall Purpose:** {page_info.get('description', 'Community engagement and content sharing')}"""
452
-
453
- elif any(word in user_input_lower for word in ['activity', 'engagement', 'active']):
454
- active_blocks = len([b for b in content_blocks if any(word in b['content_type'].lower() for word in ['post', 'question', 'event'])])
455
- return f"""**πŸ“ˆ Activity Analysis**
456
-
457
- **Content Activity Level:**
458
- - Total Content Blocks: {len(content_blocks)}
459
- - Active Engagement Posts: {active_blocks}
460
- - Informational Posts: {len(content_blocks) - active_blocks}
461
 
462
- The {url_type.lower()} shows a good mix of member engagement and informational content, suggesting an active community."""
463
-
 
 
464
  else:
465
- return f"""**πŸ€– Analysis Response**
466
-
467
- I've analyzed the {url_type.lower()} data for you.
468
-
469
- **Your question:** "{user_input}"
470
- **Content Source:** {source.upper()} data
471
- **Content Type:** {url_type}
472
-
473
- This {url_type.lower()} contains {len(content_blocks)} pieces of content focusing on community engagement and information sharing.
474
-
475
- **Try asking:**
476
- - "What is the main purpose of this group/page?"
477
- - "Summarize the content and activities"
478
- - "What kind of engagement does this content show?""""
479
-
480
  except Exception as e:
481
  return f"Analysis error: {str(e)}"
482
 
483
  def process_facebook_data(extracted_data):
484
- """Process extracted data for AI analysis with fallbacks"""
485
  if not extracted_data or extracted_data.get("status") != "success":
486
  return None, []
487
-
488
- page_info = extracted_data['page_info']
489
- content_blocks = extracted_data['content_blocks']
490
- url_type = extracted_data['url_type']
491
- source = extracted_data.get('source', 'unknown')
492
-
493
- all_text = f"FACEBOOK DATA ANALYSIS\n{'='*50}\n\n"
494
- all_text += f"πŸ“„ PAGE INFORMATION:\n"
495
- all_text += f"Title: {page_info['title']}\n"
496
- all_text += f"URL Type: {url_type}\n"
497
- all_text += f"Data Source: {source.upper()}\n"
498
- all_text += f"Access: {page_info.get('access_note', 'Public content')}\n"
499
-
500
- if page_info.get('member_count'):
501
- all_text += f"Members: {page_info['member_count']}\n"
502
- elif page_info.get('follower_count'):
503
- all_text += f"Followers: {page_info['follower_count']}\n"
504
-
505
- all_text += f"Extracted: {extracted_data['extraction_time']}\n\n"
506
-
507
- all_text += f"πŸ“Š CONTENT ANALYSIS:\n"
508
- all_text += f"Content Blocks: {len(content_blocks)}\n"
509
- all_text += f"Public Content: {sum(1 for b in content_blocks if b['is_public_content'])} blocks\n\n"
510
-
511
- for i, block in enumerate(content_blocks):
512
- all_text += f"--- BLOCK {i+1} ---\n"
513
- all_text += f"Type: {block['content_type']}\n"
514
- all_text += f"Words: {block['word_count']} | Public: {block['is_public_content']}\n"
515
- all_text += f"Content: {block['content']}\n\n"
516
-
517
- all_text += "="*50
518
-
519
- # Split into chunks
520
- splitter = CharacterTextSplitter(
521
- separator="\n",
522
- chunk_size=1000,
523
- chunk_overlap=200,
524
- length_function=len
525
- )
526
-
527
  chunks = splitter.split_text(all_text)
528
  documents = [Document(page_content=chunk) for chunk in chunks]
529
-
530
- return "simple", documents # Return simple mode instead of vectorstore
 
 
 
 
 
531
 
532
  def create_chatbot(vectorstore):
533
- """Create conversational chatbot"""
534
- try:
535
- llm = get_llm()
536
- if llm is None:
537
- return "simple" # Return simple mode if LLM fails
538
-
539
- memory = ConversationBufferMemory(
540
- memory_key="chat_history",
541
- return_messages=True,
542
- output_key="answer"
543
- )
544
-
545
- chain = ConversationalRetrievalChain.from_llm(
546
- llm=llm,
547
- retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
548
- memory=memory,
549
- return_source_documents=True,
550
- output_key="answer"
551
- )
552
- return chain
553
- except Exception as e:
554
- st.error(f"Chatbot creation failed: {str(e)}")
555
- return "simple" # Fallback to simple mode
556
 
557
  def main():
558
- st.title("πŸ“˜ Facebook Data Extractor")
559
- st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
560
 
561
  if st.button("← Back to Main Dashboard"):
562
  st.switch_page("app.py")
563
-
564
- # Initialize session state
565
  if "extractor" not in st.session_state:
566
  st.session_state.extractor = FacebookDataSimulator()
567
  if "facebook_data" not in st.session_state:
@@ -573,225 +309,69 @@ def main():
573
  if "chat_history" not in st.session_state:
574
  st.session_state.chat_history = []
575
  if "processing_mode" not in st.session_state:
576
- st.session_state.processing_mode = "ai" # ai or simple
577
-
578
  # Sidebar
579
  with st.sidebar:
580
  st.header("βš™οΈ Facebook Configuration")
581
-
582
- data_type = st.selectbox(
583
- "Content Type",
584
- ["group", "page", "event", "post", "general"],
585
- help="Select the type of Facebook content"
586
- )
587
-
588
- facebook_url = st.text_input(
589
- "Facebook URL",
590
- placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
591
- help="Enter any Facebook URL for analysis"
592
- )
593
-
594
- # Processing mode
595
- st.subheader("πŸ”§ Processing Mode")
596
- processing_mode = st.radio(
597
- "Choose analysis mode:",
598
- ["AI Analysis (Recommended)", "Simple Analysis"],
599
- help="AI Analysis uses embeddings, Simple uses rule-based"
600
- )
601
-
602
- st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
603
-
604
- # Quick test URLs
605
- st.markdown("### πŸš€ Test URLs")
606
- test_urls = {
607
- "Gaming Group": "https://www.facebook.com/groups/gamersofbangladesh2",
608
- "Tech Community": "https://www.facebook.com/groups/programmingcommunity",
609
- "Business Page": "https://www.facebook.com/Meta/",
610
- }
611
-
612
- for name, url in test_urls.items():
613
- if st.button(f"πŸ”— {name}", key=f"fb_{name}"):
614
- st.session_state.current_fb_url = url
615
- st.rerun()
616
-
617
- if st.button("πŸš€ Extract Facebook Data", type="primary"):
618
- url_to_use = facebook_url or getattr(st.session_state, 'current_fb_url', '')
619
-
620
- if not url_to_use:
621
- st.error("❌ Please enter a Facebook URL")
622
- elif 'facebook.com' not in url_to_use:
623
- st.error("❌ Please enter a valid Facebook URL")
624
  else:
625
  with st.spinner("πŸ”„ Analyzing Facebook data..."):
626
  extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
627
-
628
  if extracted_data.get("status") == "success":
629
  st.session_state.facebook_data = extracted_data
630
-
631
- # Process based on selected mode
632
- if st.session_state.processing_mode == "ai":
633
- result = process_facebook_data(extracted_data)
634
- if result and result[0] != "simple":
635
- st.session_state.vectorstore = result[0]
636
- st.session_state.chatbot = create_chatbot(result[0])
637
- st.session_state.chat_history = []
638
- st.success("βœ… AI analysis ready!")
639
  else:
640
- st.warning("⚠️ Using simple analysis (AI features limited)")
641
  st.session_state.chatbot = "simple"
642
- st.session_state.chat_history = []
643
  else:
644
  st.session_state.chatbot = "simple"
645
- st.session_state.chat_history = []
646
- st.success("βœ… Simple analysis ready!")
647
-
648
- source = extracted_data.get('source', 'unknown')
649
- if source == 'demo':
650
- st.warning("πŸ“ Using realistic demo data (Facebook restrictions active)")
651
- else:
652
- st.success("βœ… Real data extracted successfully!")
653
  else:
654
- error_msg = extracted_data.get("error", "Unknown error")
655
- st.error(f"❌ Extraction failed: {error_msg}")
656
-
657
- if st.session_state.facebook_data:
658
- st.markdown("---")
659
- if st.button("πŸ—‘οΈ Clear Data", type="secondary"):
660
- st.session_state.facebook_data = None
661
- st.session_state.vectorstore = None
662
- st.session_state.chatbot = None
663
- st.session_state.chat_history = []
664
- st.rerun()
665
-
666
- # Main content
667
- col1, col2 = st.columns([1, 1])
668
-
669
  with col1:
670
  st.header("πŸ“Š Extraction Results")
671
-
672
  if st.session_state.facebook_data:
673
  data = st.session_state.facebook_data
674
- page_info = data['page_info']
675
- content_blocks = data['content_blocks']
676
- source = data.get('source', 'unknown')
677
-
678
- if source == 'demo':
679
- st.warning("πŸ“ **Demo Data** - Realistic simulation (Facebook restrictions)")
680
- else:
681
- st.success("βœ… **Real Data** - Successfully extracted")
682
-
683
- # Show processing mode
684
- if st.session_state.processing_mode == "simple":
685
- st.info("πŸ”§ **Simple Analysis Mode** - Rule-based processing")
686
- else:
687
- st.info("πŸ€– **AI Analysis Mode** - Embedding-based processing")
688
-
689
- # Metrics
690
- col1, col2, col3 = st.columns(3)
691
- with col1:
692
- st.metric("Content Blocks", len(content_blocks))
693
- with col2:
694
- st.metric("Data Source", source.upper())
695
- with col3:
696
- st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
697
-
698
- # Page info
699
- st.subheader("🏷️ Page Information")
700
  st.write(f"**Title:** {page_info['title']}")
701
- st.write(f"**URL Type:** {data['url_type']}")
702
- st.write(f"**Description:** {page_info.get('description', 'No description')}")
703
-
704
- if page_info.get('member_count'):
705
- st.write(f"**Members:** {page_info['member_count']}")
706
- elif page_info.get('follower_count'):
707
- st.write(f"**Followers:** {page_info['follower_count']}")
708
-
709
- st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")
710
-
711
- # Content samples
712
- st.subheader("πŸ“ Content Analysis")
713
- for i, block in enumerate(content_blocks):
714
- with st.expander(f"Content {i+1} - {block['content_type']} ({block['word_count']} words)"):
715
- st.write(block['content'])
716
- st.caption(f"Public: {block['is_public_content']}")
717
-
718
- else:
719
- st.info("""
720
- ## πŸ“˜ Facebook Data Extractor
721
-
722
- **University Project Feature**
723
-
724
- **How it works:**
725
- 1. Enter any Facebook URL
726
- 2. System tries real data extraction
727
- 3. If blocked, uses **realistic demo data**
728
- 4. Choose between AI or Simple analysis
729
-
730
- **Analysis Modes:**
731
- - πŸ€– **AI Analysis**: Uses embeddings and Mistral AI
732
- - πŸ”§ **Simple Analysis**: Rule-based (works without embeddings)
733
-
734
- **Perfect for demonstrating:**
735
- - Social media data extraction concepts
736
- - AI analysis capabilities
737
- - Platform integration
738
- - Error handling strategies
739
- """)
740
 
741
  with col2:
742
- st.header("πŸ’¬ Analysis Chat")
743
-
744
- if st.session_state.chatbot and st.session_state.facebook_data:
745
- # Display chat history
746
- for chat in st.session_state.chat_history:
747
- if chat["role"] == "user":
748
- with st.chat_message("user"):
749
- st.write(chat['content'])
750
- elif chat["role"] == "assistant":
751
- with st.chat_message("assistant"):
752
- st.write(chat['content'])
753
-
754
- # Chat input
755
- user_input = st.chat_input("Ask about the Facebook data...")
756
-
757
  if user_input:
758
- st.session_state.chat_history.append({"role": "user", "content": user_input})
759
-
760
- with st.spinner("πŸ€” Analyzing..."):
761
- try:
762
- if st.session_state.chatbot == "simple":
763
- # Use simple analysis
764
- response = simple_chat_analysis(user_input, st.session_state.facebook_data)
765
- st.session_state.chat_history.append({"role": "assistant", "content": response})
766
- else:
767
- # Use AI chatbot
768
- response = st.session_state.chatbot.invoke({"question": user_input})
769
- answer = response.get("answer", "I couldn't generate a response.")
770
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
771
- st.rerun()
772
- except Exception as e:
773
- error_msg = f"Analysis Error: {str(e)}"
774
- st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
775
- st.rerun()
776
-
777
- # Suggested questions
778
- if not st.session_state.chat_history:
779
- st.subheader("πŸ’‘ Try asking:")
780
- suggestions = [
781
- "What is this Facebook group/page about?",
782
- "Summarize the main content and purpose",
783
- "What kind of community is this?",
784
- "Analyze the engagement and activity level"
785
- ]
786
-
787
- for suggestion in suggestions:
788
- if st.button(suggestion, key=f"fb_suggest_{suggestion}"):
789
- st.info(f"Type: '{suggestion}' in chat")
790
-
791
- elif st.session_state.facebook_data:
792
- st.info("πŸ’¬ Start chatting about the Facebook data")
793
- else:
794
- st.info("πŸ” Extract Facebook data to enable analysis")
795
 
796
- if __name__ == "__main__":
797
- main()
 
9
  import os
10
  import tempfile
11
 
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
 
14
  from langchain.vectorstores import FAISS
15
  from langchain.memory import ConversationBufferMemory
16
  from langchain.chains import ConversationalRetrievalChain
17
  from langchain.schema import Document
18
+ from langchain.chat_models import ChatHuggingFaceHub
19
 
20
  st.set_page_config(
21
  page_title="Facebook Data Extractor",
 
34
  try:
35
  st.info(f"πŸ” Analyzing: {url}")
36
 
 
37
  real_data = self._try_real_extraction(url)
38
  if real_data.get("status") == "success":
39
  return real_data
40
 
 
41
  st.warning("⚠️ Using demo data (Facebook restrictions active)")
42
  return self._get_demo_data(url, data_type)
43
 
 
46
  return self._get_demo_data(url, data_type)
47
 
48
  def _try_real_extraction(self, url: str) -> Dict:
 
49
  try:
 
50
  headers = {
51
+ 'User-Agent': 'Mozilla/5.0',
 
 
 
 
 
 
52
  }
 
 
53
  response = requests.get(url, headers=headers, timeout=10, verify=False)
 
54
  if response.status_code == 200:
55
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
56
  title = soup.find('title')
57
  description = soup.find('meta', attrs={'name': 'description'})
 
58
  return {
59
  "page_info": {
60
  "title": title.text if title else "Facebook Content",
 
71
  }
72
  else:
73
  return {"status": "error", "source": "real"}
 
74
  except Exception:
75
  return {"status": "error", "source": "real"}
76
 
77
  def _extract_real_content(self, soup) -> List[Dict]:
 
78
  blocks = []
79
  text = soup.get_text()
80
  paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
 
81
  for i, paragraph in enumerate(paragraphs[:8]):
82
  blocks.append({
83
  "id": i + 1,
 
87
  "content_type": "real_content",
88
  "is_public_content": True
89
  })
 
90
  return blocks
91
 
92
  def _get_demo_data(self, url: str, data_type: str) -> Dict:
 
93
  url_type = self._analyze_url_type(url)
 
94
  if 'group' in url_type.lower():
95
  return self._get_group_demo_data(url, data_type)
96
  elif 'page' in url_type.lower():
 
99
  return self._get_general_demo_data(url, data_type)
100
 
101
  def _analyze_url_type(self, url: str) -> str:
 
102
  url_lower = url.lower()
 
103
  if 'group' in url_lower:
104
  return "Facebook Group"
105
  elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
 
112
  return "Facebook Content"
113
 
114
  def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
 
115
  group_name = self._extract_name_from_url(url) or "Gaming Community"
 
116
  return {
117
  "page_info": {
118
  "title": f"{group_name} | Facebook Group",
 
124
  "access_note": "Public group - Limited data due to platform restrictions"
125
  },
126
  "content_blocks": [
127
+ {"id": 1, "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.", "length": 120, "word_count": 25, "content_type": "welcome_message", "is_public_content": True},
128
+ {"id": 2, "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.", "length": 95, "word_count": 18, "content_type": "member_post", "is_public_content": True},
129
+ {"id": 3, "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.", "length": 88, "word_count": 16, "content_type": "question_post", "is_public_content": True},
130
+ {"id": 4, "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.", "length": 102, "word_count": 19, "content_type": "event_announcement", "is_public_content": True},
131
+ {"id": 5, "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.", "length": 78, "word_count": 14, "content_type": "community_guidelines", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  ],
133
  "url_type": "Facebook Group",
134
  "extraction_time": datetime.now().isoformat(),
 
138
  }
139
 
140
  def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
 
141
  page_name = self._extract_name_from_url(url) or "Brand Page"
 
142
  return {
143
  "page_info": {
144
  "title": f"{page_name} | Facebook Page",
 
150
  "access_note": "Public page - Limited data due to platform restrictions"
151
  },
152
  "content_blocks": [
153
+ {"id": 1, "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.", "length": 98, "word_count": 15, "content_type": "welcome_message", "is_public_content": True},
154
+ {"id": 2, "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.", "length": 92, "word_count": 16, "content_type": "announcement", "is_public_content": True},
155
+ {"id": 3, "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.", "length": 87, "word_count": 14, "content_type": "event_followup", "is_public_content": True},
156
+ {"id": 4, "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.", "length": 85, "word_count": 15, "content_type": "support_info", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  ],
158
  "url_type": "Facebook Page",
159
  "extraction_time": datetime.now().isoformat(),
 
163
  }
164
 
165
  def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
 
166
  return {
167
  "page_info": {
168
  "title": "Facebook Content",
 
173
  "access_note": "Public content - Platform restrictions apply"
174
  },
175
  "content_blocks": [
176
+ {"id": 1, "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.", "length": 105, "word_count": 16, "content_type": "general_content", "is_public_content": True},
177
+ {"id": 2, "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.", "length": 82, "word_count": 12, "content_type": "platform_updates", "is_public_content": True}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  ],
179
  "url_type": "Facebook Content",
180
  "extraction_time": datetime.now().isoformat(),
 
184
  }
185
 
186
  def _extract_name_from_url(self, url: str) -> str:
 
 
187
  match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
188
  if match:
189
  name = match.group(1)
 
190
  name = name.replace('-', ' ').title()
191
  return name
192
  return ""
193
+
194
  def _create_demo_data(self) -> Dict:
 
195
  return {
196
  "groups": {
197
  "gamersofbangladesh2": "Gaming Community Bangladesh",
 
205
  }
206
  }
207
 
208
# ------------------ Hugging Face AI Integration ------------------
210
def get_embeddings():
    """Load the sentence-embedding model used to build the FAISS index.

    Returns:
        A ``HuggingFaceEmbeddings`` instance, or ``None`` when no
        ``HUGGINGFACEHUB_API_TOKEN`` is configured or loading fails.
    """
    api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
    if not api_key:
        st.error("❌ HuggingFace API Key not found")
        return None

    try:
        # Bug fix: the previous code referenced HuggingFaceInstructEmbeddings
        # with a huggingfacehub_api_token kwarg — that class is not imported
        # in this file and HuggingFaceEmbeddings takes no token (it runs the
        # sentence-transformers model locally).
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
        st.success("βœ… HuggingFace Embeddings loaded")
        return embeddings
    except Exception as e:
        st.error(f"❌ Failed to load embeddings: {e}")
        return None
223
+
224
def get_llm():
    """Load the hosted LLM used by the conversational retrieval chain.

    Returns:
        A ``HuggingFaceHub`` LLM instance, or ``None`` when no
        ``HUGGINGFACEHUB_API_TOKEN`` is configured or loading fails.
    """
    api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
    if not api_key:
        st.error("❌ HuggingFace API Key not found")
        return None

    try:
        # Bug fix: ChatHuggingFaceHub is not defined anywhere; the class
        # imported at the top of this file is HuggingFaceHub.
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-large",
            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
            huggingfacehub_api_token=api_key,
        )
        st.success("βœ… HuggingFace LLM loaded")
        return llm
    except Exception as e:
        st.error(f"❌ Failed to load LLM: {e}")
        return None
237
+
238
def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
    """Keyword-based fallback analysis when no AI chatbot is available.

    Matches a few keywords in the question and answers from the extracted
    payload. Any internal error is reported as a string rather than raised.
    """
    try:
        if not extracted_data:
            return "No data available."

        info = extracted_data.get('page_info', {})
        blocks = extracted_data.get('content_blocks', [])
        kind = extracted_data.get('url_type', 'Facebook Content')
        origin = extracted_data.get('source', 'demo')
        question = user_input.lower()

        def mentions(*terms: str) -> bool:
            # Substring match, so phrases like "what is" work too.
            return any(term in question for term in terms)

        if mentions('summary', 'summarize', 'overview'):
            return (
                f"**πŸ“Š Summary of {info.get('title', 'Facebook Content')}**\n"
                f"Type: {kind}\n"
                f"Data Source: {origin.upper()}\n"
                f"Blocks: {len(blocks)}"
            )
        if mentions('purpose', 'about', 'what is'):
            return f"**🎯 Purpose:** {info.get('description', 'Community engagement and content sharing')}"
        return f"**πŸ€– Analysis:** This {kind.lower()} contains {len(blocks)} content blocks."
    except Exception as e:
        return f"Analysis error: {str(e)}"
257
 
258
def process_facebook_data(extracted_data):
    """Split extracted content into chunks and index them for retrieval.

    Returns:
        ``(vectorstore, documents)``; ``(None, [])`` on missing/failed data,
        and the sentinel string ``"simple"`` as the first element when
        embeddings could not be loaded.
    """
    if not extracted_data or extracted_data.get("status") != "success":
        return None, []

    # Concatenate every block's text, each followed by a blank line
    # (same string the original += loop produced).
    combined = "".join(
        block["content"] + "\n\n" for block in extracted_data["content_blocks"]
    )

    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
    pieces = splitter.split_text(combined)
    docs = [Document(page_content=piece) for piece in pieces]

    embedder = get_embeddings()
    if embedder is None:
        # No embeddings available — caller falls back to keyword analysis.
        return "simple", docs

    index = FAISS.from_documents(docs, embedder)
    return index, docs
276
 
277
def create_chatbot(vectorstore):
    """Wire the retriever, memory and LLM into a conversational chain.

    Returns:
        A ``ConversationalRetrievalChain``, or the sentinel string
        ``"simple"`` when no LLM could be loaded.
    """
    language_model = get_llm()
    if language_model is None:
        return "simple"

    conversation_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
    return ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        memory=conversation_memory,
        return_source_documents=True,
        output_key="answer",
    )
291
+
292
# ------------------ Streamlit UI ------------------
 
 
 
 
 
 
 
 
293
 
294
def main():
    """Streamlit entry point: extraction controls, results view, and Q&A chat."""
    st.title("πŸ“˜ Facebook Data Extractor (Live Hugging Face)")
    st.markdown("**University Project** - Real data when possible, demo data if restricted")

    if st.button("← Back to Main Dashboard"):
        st.switch_page("app.py")

    # Session-state defaults (idempotent across Streamlit reruns).
    # Bug fix: "vectorstore" and "chatbot" are read further down but their
    # initialization lines were missing — initialize them explicitly.
    if "extractor" not in st.session_state:
        st.session_state.extractor = FacebookDataSimulator()
    if "facebook_data" not in st.session_state:
        st.session_state.facebook_data = None
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None
    if "chatbot" not in st.session_state:
        st.session_state.chatbot = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing_mode" not in st.session_state:
        st.session_state.processing_mode = "ai"

    # Sidebar: configuration + extraction trigger.
    with st.sidebar:
        st.header("βš™οΈ Facebook Configuration")
        data_type = st.selectbox("Content Type", ["group", "page", "event", "post", "general"])
        facebook_url = st.text_input("Facebook URL", "https://www.facebook.com/groups/gamersofbangladesh2")
        processing_mode = st.radio("Analysis Mode:", ["AI Analysis (Recommended)", "Simple Analysis"])
        st.session_state.processing_mode = (
            "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
        )

        if st.button("πŸš€ Extract Facebook Data"):
            url_to_use = facebook_url
            if not url_to_use or 'facebook.com' not in url_to_use:
                st.error("❌ Enter a valid Facebook URL")
            else:
                with st.spinner("πŸ”„ Analyzing Facebook data..."):
                    extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
                if extracted_data.get("status") == "success":
                    st.session_state.facebook_data = extracted_data
                    if st.session_state.processing_mode == "ai":
                        vectorstore, _ = process_facebook_data(extracted_data)
                        if vectorstore != "simple":
                            st.session_state.vectorstore = vectorstore
                            st.session_state.chatbot = create_chatbot(vectorstore)
                        else:
                            st.warning("⚠️ Using simple analysis")
                            st.session_state.chatbot = "simple"
                    else:
                        st.session_state.chatbot = "simple"
                    st.success("βœ… Data ready!")
                else:
                    st.error("❌ Extraction failed")

    # Main area: extraction results (left) and chat (right).
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("πŸ“Š Extraction Results")
        if st.session_state.facebook_data:
            data = st.session_state.facebook_data
            page_info = data["page_info"]
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**Description:** {page_info.get('description', 'No description')}")
            st.write(f"**Access:** {page_info.get('access_note', 'Public')}")
            st.subheader("Content Blocks")
            for i, block in enumerate(data["content_blocks"]):
                st.markdown(f"**Block {i+1}:** {block['content']}")

    with col2:
        st.header("πŸ’¬ Ask About This Data")
        if st.session_state.facebook_data:
            user_input = st.text_input("Enter your question")
            if user_input:
                # Bug fix: guard against a None chatbot (data present but no
                # chain built) — fall back to the keyword analysis instead of
                # calling None as a chain.
                if st.session_state.chatbot in (None, "simple"):
                    st.markdown(simple_chat_analysis(user_input, st.session_state.facebook_data))
                else:
                    chain = st.session_state.chatbot
                    result = chain({"question": user_input})
                    st.markdown(result['answer'])
                    if result.get("source_documents"):
                        st.subheader("πŸ“‘ Source Documents")
                        for doc in result["source_documents"]:
                            st.markdown(f"- {doc.page_content[:300]}...")
 
376
+ if __name__=="__main__":
377
+ main()