Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Community

Refat81 committed on Oct 21, 2025

Commit

d395d4e

verified ·

1 Parent(s): fd2cc7f

Update pages/facebook_extractor.py

Browse files

Files changed (1) hide show

pages/facebook_extractor.py +559 -121

pages/facebook_extractor.py CHANGED Viewed

@@ -9,13 +9,14 @@ from typing import List, Dict
 import os
 import tempfile
-from langchain.text_splitter import CharacterTextSplitter
-from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain.schema import Document
-from langchain.chat_models import ChatHuggingFaceHub
 st.set_page_config(
     page_title="Facebook Data Extractor",
@@ -34,10 +35,12 @@ class FacebookDataSimulator:
         try:
             st.info(f"🔍 Analyzing: {url}")
             real_data = self._try_real_extraction(url)
             if real_data.get("status") == "success":
                 return real_data
             st.warning("⚠️ Using demo data (Facebook restrictions active)")
             return self._get_demo_data(url, data_type)
@@ -46,15 +49,29 @@ class FacebookDataSimulator:
             return self._get_demo_data(url, data_type)
     def _try_real_extraction(self, url: str) -> Dict:
         try:
             headers = {
-                'User-Agent': 'Mozilla/5.0',
             }
             response = requests.get(url, headers=headers, timeout=10, verify=False)
             if response.status_code == 200:
                 soup = BeautifulSoup(response.text, 'html.parser')
                 title = soup.find('title')
                 description = soup.find('meta', attrs={'name': 'description'})
                 return {
                     "page_info": {
                         "title": title.text if title else "Facebook Content",
@@ -71,13 +88,16 @@ class FacebookDataSimulator:
                 }
             else:
                 return {"status": "error", "source": "real"}
         except Exception:
             return {"status": "error", "source": "real"}
     def _extract_real_content(self, soup) -> List[Dict]:
         blocks = []
         text = soup.get_text()
         paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
         for i, paragraph in enumerate(paragraphs[:8]):
             blocks.append({
                 "id": i + 1,
@@ -87,10 +107,13 @@ class FacebookDataSimulator:
                 "content_type": "real_content",
                 "is_public_content": True
             })
         return blocks
     def _get_demo_data(self, url: str, data_type: str) -> Dict:
         url_type = self._analyze_url_type(url)
         if 'group' in url_type.lower():
             return self._get_group_demo_data(url, data_type)
         elif 'page' in url_type.lower():
@@ -99,7 +122,9 @@ class FacebookDataSimulator:
             return self._get_general_demo_data(url, data_type)
     def _analyze_url_type(self, url: str) -> str:
         url_lower = url.lower()
         if 'group' in url_lower:
             return "Facebook Group"
         elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
@@ -112,7 +137,9 @@ class FacebookDataSimulator:
             return "Facebook Content"
     def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
         group_name = self._extract_name_from_url(url) or "Gaming Community"
         return {
             "page_info": {
                 "title": f"{group_name} | Facebook Group",
@@ -124,11 +151,46 @@ class FacebookDataSimulator:
                 "access_note": "Public group - Limited data due to platform restrictions"
             },
             "content_blocks": [
-                {"id": 1, "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.", "length": 120, "word_count": 25, "content_type": "welcome_message", "is_public_content": True},
-                {"id": 2, "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.", "length": 95, "word_count": 18, "content_type": "member_post", "is_public_content": True},
-                {"id": 3, "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.", "length": 88, "word_count": 16, "content_type": "question_post", "is_public_content": True},
-                {"id": 4, "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.", "length": 102, "word_count": 19, "content_type": "event_announcement", "is_public_content": True},
-                {"id": 5, "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.", "length": 78, "word_count": 14, "content_type": "community_guidelines", "is_public_content": True}
             ],
             "url_type": "Facebook Group",
             "extraction_time": datetime.now().isoformat(),
@@ -138,7 +200,9 @@ class FacebookDataSimulator:
         }
     def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
         page_name = self._extract_name_from_url(url) or "Brand Page"
         return {
             "page_info": {
                 "title": f"{page_name} | Facebook Page",
@@ -150,10 +214,38 @@ class FacebookDataSimulator:
                 "access_note": "Public page - Limited data due to platform restrictions"
             },
             "content_blocks": [
-                {"id": 1, "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.", "length": 98, "word_count": 15, "content_type": "welcome_message", "is_public_content": True},
-                {"id": 2, "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.", "length": 92, "word_count": 16, "content_type": "announcement", "is_public_content": True},
-                {"id": 3, "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.", "length": 87, "word_count": 14, "content_type": "event_followup", "is_public_content": True},
-                {"id": 4, "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.", "length": 85, "word_count": 15, "content_type": "support_info", "is_public_content": True}
             ],
             "url_type": "Facebook Page",
             "extraction_time": datetime.now().isoformat(),
@@ -163,6 +255,7 @@ class FacebookDataSimulator:
         }
     def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
         return {
             "page_info": {
                 "title": "Facebook Content",
@@ -173,8 +266,22 @@ class FacebookDataSimulator:
                 "access_note": "Public content - Platform restrictions apply"
             },
             "content_blocks": [
-                {"id": 1, "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.", "length": 105, "word_count": 16, "content_type": "general_content", "is_public_content": True},
-                {"id": 2, "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.", "length": 82, "word_count": 12, "content_type": "platform_updates", "is_public_content": True}
             ],
             "url_type": "Facebook Content",
             "extraction_time": datetime.now().isoformat(),
@@ -184,14 +291,18 @@ class FacebookDataSimulator:
         }
     def _extract_name_from_url(self, url: str) -> str:
         match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
         if match:
             name = match.group(1)
             name = name.replace('-', ' ').title()
             return name
         return ""
     def _create_demo_data(self) -> Dict:
         return {
             "groups": {
                 "gamersofbangladesh2": "Gaming Community Bangladesh",
@@ -205,99 +316,270 @@ class FacebookDataSimulator:
             }
         }
-# ------------------ Hugging Face AI Integration ------------------
 def get_embeddings():
-    api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    if not api_key:
-        st.error("❌ HuggingFace API Key not found")
         return None
-    embeddings = HuggingFaceInstructEmbeddings(
-        model_name="hkunlp/instructor-mini",
-        model_kwargs={"device": "cpu"},
-        huggingfacehub_api_token=api_key
-    )
-    st.success("✅ HuggingFace Embeddings loaded")
-    return embeddings
 def get_llm():
-    api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    if not api_key:
-        st.error("❌ HuggingFace API Key not found")
         return None
-    llm = ChatHuggingFaceHub(
-        repo_id="google/flan-t5-large",
-        model_kwargs={"temperature":0.7, "max_new_tokens":512},
-        huggingfacehub_api_token=api_key
-    )
-    st.success("✅ HuggingFace LLM loaded")
-    return llm
 def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
     try:
         if not extracted_data:
-            return "No data available."
         page_info = extracted_data.get('page_info', {})
         content_blocks = extracted_data.get('content_blocks', [])
         url_type = extracted_data.get('url_type', 'Facebook Content')
         source = extracted_data.get('source', 'demo')
         user_input_lower = user_input.lower()
         if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
-            return f"**📊 Summary of {page_info.get('title','Facebook Content')}**\nType: {url_type}\nData Source: {source.upper()}\nBlocks: {len(content_blocks)}"
-        elif any(word in user_input_lower for word in ['purpose','about','what is']):
-            return f"**🎯 Purpose:** {page_info.get('description','Community engagement and content sharing')}"
         else:
-            return f"**🤖 Analysis:** This {url_type.lower()} contains {len(content_blocks)} content blocks."
     except Exception as e:
         return f"Analysis error: {str(e)}"
 def process_facebook_data(extracted_data):
     if not extracted_data or extracted_data.get("status") != "success":
         return None, []
-    all_text = ""
-    for block in extracted_data["content_blocks"]:
-        all_text += block["content"] + "\n\n"
-    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
     chunks = splitter.split_text(all_text)
     documents = [Document(page_content=chunk) for chunk in chunks]
-    embeddings = get_embeddings()
-    if embeddings is None:
-        return "simple", documents
-    vectorstore = FAISS.from_documents(documents, embeddings)
-    return vectorstore, documents
 def create_chatbot(vectorstore):
-    llm = get_llm()
-    if llm is None:
-        return "simple"
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")
-    chain = ConversationalRetrievalChain.from_llm(
-        llm=llm,
-        retriever=vectorstore.as_retriever(search_kwargs={"k":3}),
-        memory=memory,
-        return_source_documents=True,
-        output_key="answer"
-    )
-    return chain
-# ------------------ Streamlit UI ------------------
 def main():
-    st.title("📘 Facebook Data Extractor (Live Hugging Face)")
-    st.markdown("**University Project** - Real data when possible, demo data if restricted")
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
     if "extractor" not in st.session_state:
         st.session_state.extractor = FacebookDataSimulator()
     if "facebook_data" not in st.session_state:
@@ -309,69 +591,225 @@ def main():
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
     if "processing_mode" not in st.session_state:
-        st.session_state.processing_mode = "ai"
     # Sidebar
     with st.sidebar:
         st.header("⚙️ Facebook Configuration")
-        data_type = st.selectbox("Content Type", ["group","page","event","post","general"])
-        facebook_url = st.text_input("Facebook URL","https://www.facebook.com/groups/gamersofbangladesh2")
-        processing_mode = st.radio("Analysis Mode:", ["AI Analysis (Recommended)","Simple Analysis"])
-        st.session_state.processing_mode = "ai" if processing_mode=="AI Analysis (Recommended)" else "simple"
-        if st.button("🚀 Extract Facebook Data"):
-            url_to_use = facebook_url
-            if not url_to_use or 'facebook.com' not in url_to_use:
-                st.error("❌ Enter a valid Facebook URL")
             else:
                 with st.spinner("🔄 Analyzing Facebook data..."):
                     extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
                     if extracted_data.get("status") == "success":
                         st.session_state.facebook_data = extracted_data
-                        if st.session_state.processing_mode=="ai":
-                            vectorstore, _ = process_facebook_data(extracted_data)
-                            if vectorstore!="simple":
-                                st.session_state.vectorstore = vectorstore
-                                st.session_state.chatbot = create_chatbot(vectorstore)
                             else:
-                                st.warning("⚠️ Using simple analysis")
                                 st.session_state.chatbot = "simple"
                         else:
                             st.session_state.chatbot = "simple"
-                        st.success("✅ Data ready!")
                     else:
-                        st.error("❌ Extraction failed")
-    # Main columns
-    col1, col2 = st.columns([1,1])
     with col1:
         st.header("📊 Extraction Results")
         if st.session_state.facebook_data:
             data = st.session_state.facebook_data
-            page_info = data["page_info"]
             st.write(f"**Title:** {page_info['title']}")
-            st.write(f"**Description:** {page_info.get('description','No description')}")
-            st.write(f"**Access:** {page_info.get('access_note','Public')}")
-            st.subheader("Content Blocks")
-            for i, block in enumerate(data["content_blocks"]):
-                st.markdown(f"**Block {i+1}:** {block['content']}")
     with col2:
-        st.header("💬 Ask About This Data")
-        if st.session_state.facebook_data:
-            user_input = st.text_input("Enter your question")
             if user_input:
-                if st.session_state.chatbot=="simple":
-                    answer = simple_chat_analysis(user_input, st.session_state.facebook_data)
-                    st.markdown(answer)
-                else:
-                    chain = st.session_state.chatbot
-                    result = chain({"question":user_input})
-                    st.markdown(result['answer'])
-                    if result.get("source_documents"):
-                        st.subheader("📑 Source Documents")
-                        for doc in result["source_documents"]:
-                            st.markdown(f"- {doc.page_content[:300]}...")
-if __name__=="__main__":
-    main()

 import os
 import tempfile
+# Import your existing AI components
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain.schema import Document
+from langchain_community.llms import HuggingFaceHub
 st.set_page_config(
     page_title="Facebook Data Extractor",
         try:
             st.info(f"🔍 Analyzing: {url}")
+            # Try real extraction first
             real_data = self._try_real_extraction(url)
             if real_data.get("status") == "success":
                 return real_data
+            # If real extraction fails, use demo data
             st.warning("⚠️ Using demo data (Facebook restrictions active)")
             return self._get_demo_data(url, data_type)
             return self._get_demo_data(url, data_type)
     def _try_real_extraction(self, url: str) -> Dict:
+        """Try real extraction with better error handling"""
         try:
+            # Use a proxy-like approach with different user agents
             headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
             }
+            # Try with shorter timeout
             response = requests.get(url, headers=headers, timeout=10, verify=False)
             if response.status_code == 200:
                 soup = BeautifulSoup(response.text, 'html.parser')
+                # Extract basic info
                 title = soup.find('title')
                 description = soup.find('meta', attrs={'name': 'description'})
                 return {
                     "page_info": {
                         "title": title.text if title else "Facebook Content",
                 }
             else:
                 return {"status": "error", "source": "real"}
         except Exception:
             return {"status": "error", "source": "real"}
     def _extract_real_content(self, soup) -> List[Dict]:
+        """Extract content from real page"""
         blocks = []
         text = soup.get_text()
         paragraphs = [p.strip() for p in text.split('.') if p.strip() and len(p.strip()) > 30]
         for i, paragraph in enumerate(paragraphs[:8]):
             blocks.append({
                 "id": i + 1,
                 "content_type": "real_content",
                 "is_public_content": True
             })
         return blocks
     def _get_demo_data(self, url: str, data_type: str) -> Dict:
+        """Get realistic demo data based on URL type"""
         url_type = self._analyze_url_type(url)
         if 'group' in url_type.lower():
             return self._get_group_demo_data(url, data_type)
         elif 'page' in url_type.lower():
             return self._get_general_demo_data(url, data_type)
     def _analyze_url_type(self, url: str) -> str:
+        """Analyze URL type for realistic demo data"""
         url_lower = url.lower()
         if 'group' in url_lower:
             return "Facebook Group"
         elif 'page' in url_lower or 'facebook.com/' in url_lower and '/pages/' not in url_lower:
             return "Facebook Content"
     def _get_group_demo_data(self, url: str, data_type: str) -> Dict:
+        """Get realistic group demo data"""
         group_name = self._extract_name_from_url(url) or "Gaming Community"
         return {
             "page_info": {
                 "title": f"{group_name} | Facebook Group",
                 "access_note": "Public group - Limited data due to platform restrictions"
             },
             "content_blocks": [
+                {
+                    "id": 1,
+                    "content": f"Welcome to {group_name}! This is a community for fans and enthusiasts to share their experiences, ask questions, and connect with like-minded people.",
+                    "length": 120,
+                    "word_count": 25,
+                    "content_type": "welcome_message",
+                    "is_public_content": True
+                },
+                {
+                    "id": 2,
+                    "content": "Just shared my latest project in the group! Would love to get some feedback from the community on the new features we're implementing.",
+                    "length": 95,
+                    "word_count": 18,
+                    "content_type": "member_post",
+                    "is_public_content": True
+                },
+                {
+                    "id": 3,
+                    "content": "Does anyone have experience with this issue? I've been trying to solve it for a while and could use some community wisdom.",
+                    "length": 88,
+                    "word_count": 16,
+                    "content_type": "question_post",
+                    "is_public_content": True
+                },
+                {
+                    "id": 4,
+                    "content": "Our monthly meetup is scheduled for next Saturday! Don't forget to RSVP so we can plan accordingly. Looking forward to seeing everyone there.",
+                    "length": 102,
+                    "word_count": 19,
+                    "content_type": "event_announcement",
+                    "is_public_content": True
+                },
+                {
+                    "id": 5,
+                    "content": "The community guidelines: Be respectful, no spam, keep discussions relevant to the group's topic, and help each other grow.",
+                    "length": 78,
+                    "word_count": 14,
+                    "content_type": "community_guidelines",
+                    "is_public_content": True
+                }
             ],
             "url_type": "Facebook Group",
             "extraction_time": datetime.now().isoformat(),
         }
     def _get_page_demo_data(self, url: str, data_type: str) -> Dict:
+        """Get realistic page demo data"""
         page_name = self._extract_name_from_url(url) or "Brand Page"
         return {
             "page_info": {
                 "title": f"{page_name} | Facebook Page",
                 "access_note": "Public page - Limited data due to platform restrictions"
             },
             "content_blocks": [
+                {
+                    "id": 1,
+                    "content": f"Welcome to the official {page_name} Facebook page! Here you'll find the latest updates, news, and announcements from our team.",
+                    "length": 98,
+                    "word_count": 15,
+                    "content_type": "welcome_message",
+                    "is_public_content": True
+                },
+                {
+                    "id": 2,
+                    "content": "We're excited to announce our new product launch next week! Stay tuned for more details and special offers for our Facebook community.",
+                    "length": 92,
+                    "word_count": 16,
+                    "content_type": "announcement",
+                    "is_public_content": True
+                },
+                {
+                    "id": 3,
+                    "content": "Thank you to everyone who participated in our recent event! The feedback has been incredible and we're already planning the next one.",
+                    "length": 87,
+                    "word_count": 14,
+                    "content_type": "event_followup",
+                    "is_public_content": True
+                },
+                {
+                    "id": 4,
+                    "content": "Customer support hours: Monday-Friday 9AM-6PM. For urgent issues, please message us directly and we'll respond as soon as possible.",
+                    "length": 85,
+                    "word_count": 15,
+                    "content_type": "support_info",
+                    "is_public_content": True
+                }
             ],
             "url_type": "Facebook Page",
             "extraction_time": datetime.now().isoformat(),
         }
     def _get_general_demo_data(self, url: str, data_type: str) -> Dict:
+        """Get general demo data"""
         return {
             "page_info": {
                 "title": "Facebook Content",
                 "access_note": "Public content - Platform restrictions apply"
             },
             "content_blocks": [
+                {
+                    "id": 1,
+                    "content": "Community engagement and social interactions are key aspects of this platform. Users share content, connect with friends, and participate in discussions.",
+                    "length": 105,
+                    "word_count": 16,
+                    "content_type": "general_content",
+                    "is_public_content": True
+                },
+                {
+                    "id": 2,
+                    "content": "Recent updates have improved user experience with better content discovery and enhanced privacy controls for community members.",
+                    "length": 82,
+                    "word_count": 12,
+                    "content_type": "platform_updates",
+                    "is_public_content": True
+                }
             ],
             "url_type": "Facebook Content",
             "extraction_time": datetime.now().isoformat(),
         }
     def _extract_name_from_url(self, url: str) -> str:
+        """Extract name from URL for realistic demo data"""
+        # Extract name from URL for more realistic demo data
         match = re.search(r'facebook\.com/(?:groups/|pages/)?([^/?]+)', url)
         if match:
             name = match.group(1)
+            # Clean up the name
             name = name.replace('-', ' ').title()
             return name
         return ""
     def _create_demo_data(self) -> Dict:
+        """Create comprehensive demo data"""
         return {
             "groups": {
                 "gamersofbangladesh2": "Gaming Community Bangladesh",
             }
         }
 def get_embeddings():
     """Load a sentence-transformers embedding model, trying several candidates.

     Each candidate is loaded on CPU through ``HuggingFaceEmbeddings`` and
     smoke-tested with a single ``embed_query`` call; the first one that
     yields a non-empty vector is returned. If every candidate fails, one
     last attempt is made with the library's default cache location.

     Progress, warnings and errors are reported through the Streamlit UI.

     Returns:
         The initialised embeddings object, or ``None`` if no model could be
         loaded.
     """
     model_options = [
         "sentence-transformers/all-MiniLM-L6-v2",
         "sentence-transformers/paraphrase-MiniLM-L3-v2",
         "sentence-transformers/all-mpnet-base-v2"
     ]
     # Keep the cache in a stable directory under the system temp dir. A
     # ``tempfile.TemporaryDirectory()`` context would be deleted the moment
     # this function returns, so every call would start from an empty cache.
     cache_dir = os.path.join(tempfile.gettempdir(), "hf_embeddings_cache")

     for model_name in model_options:
         try:
             st.info(f"🔄 Trying embedding model: {model_name}")
             # Created inside the per-model guard so that a failure here is
             # reported like any other load failure and the next option runs.
             os.makedirs(cache_dir, exist_ok=True)
             embeddings = HuggingFaceEmbeddings(
                 model_name=model_name,
                 cache_folder=cache_dir,
                 model_kwargs={'device': 'cpu'}
             )
             # Test the embeddings before handing them to the caller.
             test_embedding = embeddings.embed_query("Hello world")
             if test_embedding:
                 st.success(f"✅ Loaded embeddings: {model_name.split('/')[-1]}")
                 return embeddings
         except Exception as e:
             st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")

     # If all models fail, try once more with the library's default cache.
     st.warning("🔄 Trying fallback embedding method...")
     try:
         embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2"
         )
         st.success("✅ Loaded fallback embeddings")
         return embeddings
     except Exception as e:
         st.error(f"❌ All embedding models failed: {e}")
         return None
 def get_llm():
     """Return the first HuggingFace Hub LLM that answers a probe prompt.

     Reads the API token from ``HUGGINGFACEHUB_API_TOKEN``. Each candidate
     repository is wrapped in ``HuggingFaceHub`` and sent the prompt
     ``"Hello"``; the first one producing a non-blank reply is returned.

     Returns:
         The working LLM wrapper, or ``None`` when the token is missing,
         every candidate fails, or an unexpected error occurs. All outcomes
         are reported through the Streamlit UI.
     """
     candidate_repos = (
         "mistralai/Mistral-7B-Instruct-v0.1",
         "google/flan-t5-large",
         "microsoft/DialoGPT-large",
     )
     generation_kwargs = {
         "temperature": 0.7,
         "max_length": 512,
         "max_new_tokens": 256,
     }

     try:
         token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
         if not token:
             st.error("HuggingFace API Key not found")
             return None

         for repo in candidate_repos:
             try:
                 st.info(f"🔄 Trying LLM: {repo}")
                 candidate = HuggingFaceHub(
                     repo_id=repo,
                     huggingfacehub_api_token=token,
                     # Fresh dict per candidate, as each wrapper gets its own.
                     model_kwargs=dict(generation_kwargs),
                 )
                 # Probe the model; a blank reply counts as not usable.
                 reply = candidate.invoke("Hello")
                 if reply and reply.strip():
                     short_name = repo.split('/')[-1]
                     st.success(f"✅ Loaded LLM: {short_name}")
                     return candidate
             except Exception as exc:
                 st.warning(f"⚠️ Failed to load {repo}: {str(exc)}")

         st.error("❌ All LLMs failed to load")
         return None
     except Exception as exc:
         st.error(f"❌ LLM error: {exc}")
         return None
 def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
+    """Simple rule-based chat analysis when embeddings fail"""
     try:
         if not extracted_data:
+            return "No data available for analysis."
         page_info = extracted_data.get('page_info', {})
         content_blocks = extracted_data.get('content_blocks', [])
         url_type = extracted_data.get('url_type', 'Facebook Content')
         source = extracted_data.get('source', 'demo')
         user_input_lower = user_input.lower()
+        # Basic analysis based on input
         if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
+            response_lines = [
+                f"**📊 Summary of {page_info.get('title', 'Facebook Content')}**",
+                "",
+                f"**Type:** {url_type}",
+                f"**Data Source:** {source.upper()}",
+                f"**Description:** {page_info.get('description', 'No description available')}",
+                "",
+                f"This appears to be a {url_type.lower()} with {len(content_blocks)} content blocks of public information.",
+                "",
+                "**Key Content Types:**",
+                f"{', '.join(set(block['content_type'] for block in content_blocks))}",
+                "",
+                "The content focuses on community engagement and social interactions."
+            ]
+            return "\n".join(response_lines)
+        elif any(word in user_input_lower for word in ['purpose', 'about', 'what is']):
+            community_posts = len([b for b in content_blocks if 'community' in b['content_type'].lower()])
+            announcement_posts = len([b for b in content_blocks if 'announcement' in b['content_type'].lower()])
+            member_posts = len([b for b in content_blocks if 'post' in b['content_type'].lower()])
+            response_lines = [
+                "**🎯 Purpose Analysis**",
+                "",
+                f"Based on the extracted data, this {url_type.lower()} appears to be focused on:",
+                "",
+                f"- **Community Building:** {community_posts} community-related posts",
+                f"- **Information Sharing:** {announcement_posts} announcements",
+                f"- **Member Engagement:** {member_posts} member posts",
+                "",
+                f"**Overall Purpose:** {page_info.get('description', 'Community engagement and content sharing')}"
+            ]
+            return "\n".join(response_lines)
+        elif any(word in user_input_lower for word in ['activity', 'engagement', 'active']):
+            active_blocks = len([b for b in content_blocks if any(word in b['content_type'].lower() for word in ['post', 'question', 'event'])])
+            info_blocks = len(content_blocks) - active_blocks
+            response_lines = [
+                "**📈 Activity Analysis**",
+                "",
+                "**Content Activity Level:**",
+                f"- Total Content Blocks: {len(content_blocks)}",
+                f"- Active Engagement Posts: {active_blocks}",
+                f"- Informational Posts: {info_blocks}",
+                "",
+                f"The {url_type.lower()} shows a good mix of member engagement and informational content, suggesting an active community."
+            ]
+            return "\n".join(response_lines)
         else:
+            response_lines = [
+                "**🤖 Analysis Response**",
+                "",
+                f"I've analyzed the {url_type.lower()} data for you.",
+                "",
+                f"**Your question:** \"{user_input}\"",
+                f"**Content Source:** {source.upper()} data",
+                f"**Content Type:** {url_type}",
+                "",
+                f"This {url_type.lower()} contains {len(content_blocks)} pieces of content focusing on community engagement and information sharing.",
+                "",
+                "**Try asking:**",
+                "- \"What is the main purpose of this group/page?\"",
+                "- \"Summarize the content and activities\"",
+                "- \"What kind of engagement does this content show?\""
+            ]
+            return "\n".join(response_lines)
     except Exception as e:
         return f"Analysis error: {str(e)}"
 def process_facebook_data(extracted_data):
     """Turn extracted Facebook data into text chunks for analysis.

     A plain-text report (page information, content statistics and every
     content block) is assembled, split into overlapping chunks with
     ``CharacterTextSplitter`` and wrapped in ``Document`` objects.

     Missing optional keys fall back to defaults instead of raising
     ``KeyError``, so partially populated extraction results can still be
     processed.

     Args:
         extracted_data: Result dict from the extractor. Only results whose
             ``"status"`` is ``"success"`` are processed.

     Returns:
         ``(None, [])`` when ``extracted_data`` is empty or not successful;
         otherwise ``("simple", documents)``. The first element is a mode
         marker (no vector store is built here), the second the list of
         chunk documents.
     """
     if not extracted_data or extracted_data.get("status") != "success":
         return None, []

     page_info = extracted_data.get('page_info') or {}
     content_blocks = extracted_data.get('content_blocks') or []
     url_type = extracted_data.get('url_type', 'Facebook Content')
     source = extracted_data.get('source', 'unknown')

     # Collect report lines and join once instead of repeated string "+=".
     report = [
         "FACEBOOK DATA ANALYSIS",
         "=" * 50,
         "",
         "📄 PAGE INFORMATION:",
         f"Title: {page_info.get('title', 'Facebook Content')}",
         f"URL Type: {url_type}",
         f"Data Source: {source.upper()}",
         f"Access: {page_info.get('access_note', 'Public content')}",
     ]
     # Groups report members, pages report followers; show whichever exists.
     if page_info.get('member_count'):
         report.append(f"Members: {page_info['member_count']}")
     elif page_info.get('follower_count'):
         report.append(f"Followers: {page_info['follower_count']}")

     public_blocks = sum(
         1 for block in content_blocks if block.get('is_public_content')
     )
     report += [
         f"Extracted: {extracted_data.get('extraction_time', 'Unknown')}",
         "",
         "📊 CONTENT ANALYSIS:",
         f"Content Blocks: {len(content_blocks)}",
         f"Public Content: {public_blocks} blocks",
         "",
     ]

     for index, block in enumerate(content_blocks, start=1):
         content = block.get('content', '')
         # Fall back to counting words when the block carries no count.
         word_count = block.get('word_count', len(str(content).split()))
         report += [
             f"--- BLOCK {index} ---",
             f"Type: {block.get('content_type', 'Unknown')}",
             f"Words: {word_count} | Public: {block.get('is_public_content', False)}",
             f"Content: {content}",
             "",
         ]

     report.append("=" * 50)
     all_text = "\n".join(report)

     # Split into chunks
     splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=1000,
         chunk_overlap=200,
         length_function=len,
     )
     chunks = splitter.split_text(all_text)
     documents = [Document(page_content=chunk) for chunk in chunks]
     return "simple", documents  # Return simple mode instead of vectorstore
 def create_chatbot(vectorstore):
     """Build the conversational retrieval chain used by the chat panel.

     Args:
         vectorstore: Object exposing ``as_retriever``; the chain retrieves
             the top 3 matches from it for each question.

     Returns:
         The ``ConversationalRetrievalChain`` on success, or the string
         ``"simple"`` when ``get_llm()`` returns ``None`` or any step raises
         (the error is shown with ``st.error``).
     """
     try:
         language_model = get_llm()
         if language_model is None:
             # No LLM available: signal the caller to use simple mode.
             return "simple"

         conversation_memory = ConversationBufferMemory(
             memory_key="chat_history",
             return_messages=True,
             output_key="answer",
         )
         doc_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
         return ConversationalRetrievalChain.from_llm(
             llm=language_model,
             retriever=doc_retriever,
             memory=conversation_memory,
             return_source_documents=True,
             output_key="answer",
         )
     except Exception as exc:
         st.error(f"Chatbot creation failed: {exc!s}")
         # Fall back to simple mode rather than breaking the page.
         return "simple"
 def main():
     """Render the Facebook Data Extractor page.

     Layout: title and back button, a sidebar with the extraction controls,
     and two equal page columns (extraction results on the left, analysis
     chat on the right). All state lives in ``st.session_state``.
     """
     st.title("📘 Facebook Data Extractor")
     st.markdown("**University Project** - Real data when possible, realistic demo data when restricted")
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")

     _init_facebook_session_state()

     with st.sidebar:
         _render_facebook_sidebar()

     # The two page columns get their own names: the metrics row inside the
     # results panel creates further columns and must not rebind these, or
     # the chat would be drawn inside a metrics column.
     results_col, chat_col = st.columns([1, 1])
     with results_col:
         _render_extraction_results()
     with chat_col:
         _render_analysis_chat()


 def _init_facebook_session_state():
     """Create every session-state key the page reads, if not yet present."""
     if "extractor" not in st.session_state:
         st.session_state.extractor = FacebookDataSimulator()
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
     scalar_defaults = (
         ("facebook_data", None),
         ("vectorstore", None),
         ("chatbot", None),
         ("processing_mode", "ai"),  # ai or simple
     )
     for key, default in scalar_defaults:
         if key not in st.session_state:
             st.session_state[key] = default


 def _render_facebook_sidebar():
     """Draw the sidebar controls and react to their buttons."""
     st.header("⚙️ Facebook Configuration")
     data_type = st.selectbox(
         "Content Type",
         ["group", "page", "event", "post", "general"],
         help="Select the type of Facebook content"
     )
     facebook_url = st.text_input(
         "Facebook URL",
         placeholder="https://www.facebook.com/groups/gamersofbangladesh2",
         help="Enter any Facebook URL for analysis"
     )

     # Processing mode
     st.subheader("🔧 Processing Mode")
     processing_mode = st.radio(
         "Choose analysis mode:",
         ["AI Analysis (Recommended)", "Simple Analysis"],
         help="AI Analysis uses embeddings, Simple uses rule-based"
     )
     st.session_state.processing_mode = (
         "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
     )

     # Quick test URLs
     st.markdown("### 🚀 Test URLs")
     test_urls = {
         "Gaming Group": "https://www.facebook.com/groups/gamersofbangladesh2",
         "Tech Community": "https://www.facebook.com/groups/programmingcommunity",
         "Business Page": "https://www.facebook.com/Meta/",
     }
     for name, url in test_urls.items():
         if st.button(f"🔗 {name}", key=f"fb_{name}"):
             st.session_state.current_fb_url = url
             st.rerun()

     if st.button("🚀 Extract Facebook Data", type="primary"):
         # A typed URL wins over the last selected test URL.
         url_to_use = facebook_url or st.session_state.get('current_fb_url', '')
         if not url_to_use:
             st.error("❌ Please enter a Facebook URL")
         elif 'facebook.com' not in url_to_use:
             st.error("❌ Please enter a valid Facebook URL")
         else:
             _run_facebook_extraction(url_to_use, data_type)

     if st.session_state.facebook_data:
         st.markdown("---")
         if st.button("🗑️ Clear Data", type="secondary"):
             st.session_state.facebook_data = None
             st.session_state.vectorstore = None
             st.session_state.chatbot = None
             st.session_state.chat_history = []
             st.rerun()


 def _run_facebook_extraction(url_to_use, data_type):
     """Extract data for the URL and set up the chatbot for the chosen mode."""
     with st.spinner("🔄 Analyzing Facebook data..."):
         extracted_data = st.session_state.extractor.extract_data(url_to_use, data_type)
         if extracted_data.get("status") != "success":
             error_msg = extracted_data.get("error", "Unknown error")
             st.error(f"❌ Extraction failed: {error_msg}")
             return

         st.session_state.facebook_data = extracted_data
         st.session_state.chat_history = []

         # Process based on selected mode
         if st.session_state.processing_mode == "ai":
             result = process_facebook_data(extracted_data)
             vectorstore = result[0] if result else None
             chatbot = "simple"
             # Only a real vector store (neither None nor the "simple"
             # marker) can back the AI chatbot.
             if vectorstore is not None and vectorstore != "simple":
                 st.session_state.vectorstore = vectorstore
                 chatbot = create_chatbot(vectorstore)
             st.session_state.chatbot = chatbot
             # Report the mode that is actually active, not the requested one.
             if chatbot == "simple":
                 st.warning("⚠️ Using simple analysis (AI features limited)")
             else:
                 st.success("✅ AI analysis ready!")
         else:
             st.session_state.chatbot = "simple"
             st.success("✅ Simple analysis ready!")

         source = extracted_data.get('source', 'unknown')
         if source == 'demo':
             st.warning("📝 Using realistic demo data (Facebook restrictions active)")
         else:
             st.success("✅ Real data extracted successfully!")


 def _render_extraction_results():
     """Draw the results panel: data summary, or usage notes when empty."""
     st.header("📊 Extraction Results")
     data = st.session_state.facebook_data
     if not data:
         # The indentation inside this literal is part of the displayed text
         # and is kept exactly as it was.
         st.info("""
             ## 📘 Facebook Data Extractor
             **University Project Feature**
             **How it works:**
             1. Enter any Facebook URL
             2. System tries real data extraction
             3. If blocked, uses **realistic demo data**
             4. Choose between AI or Simple analysis
             **Analysis Modes:**
             - 🤖 **AI Analysis**: Uses embeddings and Mistral AI
             - 🔧 **Simple Analysis**: Rule-based (works without embeddings)
             **Perfect for demonstrating:**
             - Social media data extraction concepts
             - AI analysis capabilities
             - Platform integration
             - Error handling strategies
             """)
         return

     page_info = data['page_info']
     content_blocks = data['content_blocks']
     source = data.get('source', 'unknown')
     simple_mode = st.session_state.processing_mode == "simple"

     if source == 'demo':
         st.warning("📝 **Demo Data** - Realistic simulation (Facebook restrictions)")
     else:
         st.success("✅ **Real Data** - Successfully extracted")

     # Show processing mode
     if simple_mode:
         st.info("🔧 **Simple Analysis Mode** - Rule-based processing")
     else:
         st.info("🤖 **AI Analysis Mode** - Embedding-based processing")

     # Metrics
     blocks_col, source_col, mode_col = st.columns(3)
     with blocks_col:
         st.metric("Content Blocks", len(content_blocks))
     with source_col:
         st.metric("Data Source", source.upper())
     with mode_col:
         st.metric("Analysis Mode", "Simple" if simple_mode else "AI")

     # Page info
     st.subheader("🏷️ Page Information")
     st.write(f"**Title:** {page_info['title']}")
     st.write(f"**URL Type:** {data['url_type']}")
     st.write(f"**Description:** {page_info.get('description', 'No description')}")
     if page_info.get('member_count'):
         st.write(f"**Members:** {page_info['member_count']}")
     elif page_info.get('follower_count'):
         st.write(f"**Followers:** {page_info['follower_count']}")
     st.write(f"**Access:** {page_info.get('access_note', 'Public content')}")

     # Content samples
     st.subheader("📝 Content Analysis")
     for number, block in enumerate(content_blocks, start=1):
         with st.expander(f"Content {number} - {block['content_type']} ({block['word_count']} words)"):
             st.write(block['content'])
             st.caption(f"Public: {block['is_public_content']}")


 def _render_analysis_chat():
     """Draw the chat panel and answer a newly submitted question."""
     st.header("💬 Analysis Chat")
     if not st.session_state.facebook_data:
         st.info("🔍 Extract Facebook data to enable analysis")
         return
     if not st.session_state.chatbot:
         st.info("💬 Start chatting about the Facebook data")
         return

     # Display chat history
     for chat in st.session_state.chat_history:
         if chat["role"] in ("user", "assistant"):
             with st.chat_message(chat["role"]):
                 st.write(chat['content'])

     # Chat input
     user_input = st.chat_input("Ask about the Facebook data...")
     if user_input:
         st.session_state.chat_history.append({"role": "user", "content": user_input})
         with st.spinner("🤔 Analyzing..."):
             try:
                 if st.session_state.chatbot == "simple":
                     # Use simple analysis
                     reply = simple_chat_analysis(user_input, st.session_state.facebook_data)
                 else:
                     # Use AI chatbot
                     response = st.session_state.chatbot.invoke({"question": user_input})
                     reply = response.get("answer", "I couldn't generate a response.")
             except Exception as e:
                 # Show the failure in the conversation instead of crashing.
                 reply = f"Analysis Error: {str(e)}"
         st.session_state.chat_history.append({"role": "assistant", "content": reply})
         # Rerun outside the try block so the new messages are drawn by the
         # history loop on the next script run.
         st.rerun()

     # Suggested questions
     if not st.session_state.chat_history:
         st.subheader("💡 Try asking:")
         suggestions = [
             "What is this Facebook group/page about?",
             "Summarize the main content and purpose",
             "What kind of community is this?",
             "Analyze the engagement and activity level"
         ]
         for suggestion in suggestions:
             if st.button(suggestion, key=f"fb_suggest_{suggestion}"):
                 st.info(f"Type: '{suggestion}' in chat")
# Script entry point: render the page when this module is run as the main program.
+if __name__ == "__main__":
+    main()