Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed on Oct 21, 2025

Commit

47ac751

verified ·

1 Parent(s): 67a4166

Update pages/facebook_extractor.py

Browse files

Files changed (1) hide show

pages/facebook_extractor.py +210 -58

pages/facebook_extractor.py CHANGED Viewed

@@ -7,6 +7,7 @@ import re
 from datetime import datetime
 from typing import List, Dict
 import os
 # Import your existing AI components
 from langchain_text_splitters import CharacterTextSplitter
@@ -315,16 +316,53 @@ class FacebookDataSimulator:
             }
         }
-# AI Functions (same as your LinkedIn analyzer)
 def get_embeddings():
-    """Initialize embeddings"""
     try:
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-        return embeddings
     except Exception as e:
-        st.error(f"Embeddings error: {e}")
         return None
 def get_llm():
@@ -335,22 +373,115 @@ def get_llm():
             st.error("HuggingFace API Key not found")
             return None
-        llm = HuggingFaceHub(
-            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
-            huggingfacehub_api_token=api_key,
-            model_kwargs={
-                "temperature": 0.7,
-                "max_length": 512,
-                "max_new_tokens": 256,
-            }
-        )
-        return llm
     except Exception as e:
-        st.error(f"LLM error: {e}")
         return None
 def process_facebook_data(extracted_data):
-    """Process extracted data for AI analysis"""
     if not extracted_data or extracted_data.get("status") != "success":
         return None, []
@@ -396,23 +527,14 @@ def process_facebook_data(extracted_data):
     chunks = splitter.split_text(all_text)
     documents = [Document(page_content=chunk) for chunk in chunks]
-    # Create vector store
-    try:
-        embeddings = get_embeddings()
-        if embeddings is None:
-            return None, []
-        vectorstore = FAISS.from_documents(documents, embeddings)
-        return vectorstore, chunks
-    except Exception as e:
-        st.error(f"Vector store failed: {e}")
-        return None, []
 def create_chatbot(vectorstore):
     """Create conversational chatbot"""
     try:
         llm = get_llm()
         if llm is None:
-            return None
         memory = ConversationBufferMemory(
             memory_key="chat_history",
@@ -430,7 +552,7 @@ def create_chatbot(vectorstore):
         return chain
     except Exception as e:
         st.error(f"Chatbot creation failed: {str(e)}")
-        return None
 def main():
     st.title("📘 Facebook Data Extractor")
@@ -450,6 +572,8 @@ def main():
         st.session_state.chatbot = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
     # Sidebar
     with st.sidebar:
@@ -467,6 +591,16 @@ def main():
             help="Enter any Facebook URL for analysis"
         )
         # Quick test URLs
         st.markdown("### 🚀 Test URLs")
         test_urls = {
@@ -494,20 +628,28 @@ def main():
                     if extracted_data.get("status") == "success":
                         st.session_state.facebook_data = extracted_data
-                        # Process for AI
-                        vectorstore, chunks = process_facebook_data(extracted_data)
-                        if vectorstore:
-                            st.session_state.vectorstore = vectorstore
-                            st.session_state.chatbot = create_chatbot(vectorstore)
-                            st.session_state.chat_history = []
-                            source = extracted_data.get('source', 'unknown')
-                            if source == 'demo':
-                                st.warning("📝 Using realistic demo data (Facebook restrictions active)")
                             else:
-                                st.success("✅ Real data extracted successfully!")
                         else:
-                            st.error("❌ Failed to process data for AI")
                     else:
                         error_msg = extracted_data.get("error", "Unknown error")
                         st.error(f"❌ Extraction failed: {error_msg}")
@@ -538,6 +680,12 @@ def main():
             else:
                 st.success("✅ **Real Data** - Successfully extracted")
             # Metrics
             col1, col2, col3 = st.columns(3)
             with col1:
@@ -545,7 +693,7 @@ def main():
             with col2:
                 st.metric("Data Source", source.upper())
             with col3:
-                st.metric("Status", "Success")
             # Page info
             st.subheader("🏷️ Page Information")
@@ -577,13 +725,11 @@ def main():
             1. Enter any Facebook URL
             2. System tries real data extraction
             3. If blocked, uses **realistic demo data**
-            4. Full AI analysis available
-            **Features:**
-            - Real data extraction when possible
-            - Realistic demo data when restricted
-            - Full AI-powered analysis
-            - Professional interface
             **Perfect for demonstrating:**
             - Social media data extraction concepts
@@ -593,7 +739,7 @@ def main():
             """)
     with col2:
-        st.header("💬 AI Analysis Chat")
         if st.session_state.chatbot and st.session_state.facebook_data:
             # Display chat history
@@ -611,14 +757,20 @@ def main():
             if user_input:
                 st.session_state.chat_history.append({"role": "user", "content": user_input})
-                with st.spinner("🤔 AI is analyzing..."):
                     try:
-                        response = st.session_state.chatbot.invoke({"question": user_input})
-                        answer = response.get("answer", "I couldn't generate a response.")
-                        st.session_state.chat_history.append({"role": "assistant", "content": answer})
                         st.rerun()
                     except Exception as e:
-                        error_msg = f"AI Error: {str(e)}"
                         st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
                         st.rerun()
@@ -637,9 +789,9 @@ def main():
                         st.info(f"Type: '{suggestion}' in chat")
         elif st.session_state.facebook_data:
-            st.info("💬 Start chatting with AI about the Facebook data")
         else:
-            st.info("🔍 Extract Facebook data to enable AI chat")
 if __name__ == "__main__":
     main()

 from datetime import datetime
 from typing import List, Dict
 import os
+import tempfile
 # Import your existing AI components
 from langchain_text_splitters import CharacterTextSplitter
             }
         }
 def get_embeddings():
     """Load a HuggingFace embedding model, trying several candidates in turn.

     Each candidate is instantiated on CPU and smoke-tested with a single
     ``embed_query`` call. The first candidate that yields a non-empty
     vector is returned. If every candidate fails, one last attempt is made
     with the library's default cache location.

     Returns:
         The working ``HuggingFaceEmbeddings`` instance, or ``None`` when no
         model could be loaded (an error is shown through Streamlit).
     """
     try:
         model_options = [
             "sentence-transformers/all-MiniLM-L6-v2",
             "sentence-transformers/paraphrase-MiniLM-L3-v2",
             "sentence-transformers/all-mpnet-base-v2",
         ]
         # Use a *persistent* directory under the system temp dir. A
         # TemporaryDirectory context manager would delete the downloaded
         # model files as soon as this function returns, forcing a fresh
         # download on every call and risking cleanup errors on platforms
         # that keep the files open.
         cache_dir = os.path.join(tempfile.gettempdir(), "hf_embeddings_cache")
         try:
             os.makedirs(cache_dir, exist_ok=True)
         except OSError:
             # Not writable: let the library pick its default cache instead.
             cache_dir = None
         for model_name in model_options:
             try:
                 st.info(f"🔄 Trying embedding model: {model_name}")
                 embeddings = HuggingFaceEmbeddings(
                     model_name=model_name,
                     cache_folder=cache_dir,
                     model_kwargs={'device': 'cpu'},
                 )
                 # Smoke test: a usable model returns a non-empty vector.
                 if embeddings.embed_query("Hello world"):
                     st.success(f"✅ Loaded embeddings: {model_name.split('/')[-1]}")
                     return embeddings
             except Exception as e:
                 # Best effort by design: report and move on to the next model.
                 st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")
         # Every candidate failed with the explicit cache; retry the smallest
         # model with the library's default cache location.
         st.warning("🔄 Trying fallback embedding method...")
         try:
             embeddings = HuggingFaceEmbeddings(
                 model_name="sentence-transformers/all-MiniLM-L6-v2"
             )
             st.success("✅ Loaded fallback embeddings")
             return embeddings
         except Exception as e:
             st.error(f"❌ All embedding models failed: {e}")
             return None
     except Exception as e:
         st.error(f"❌ Embeddings error: {e}")
         return None
 def get_llm():
             st.error("HuggingFace API Key not found")
             return None
+        # Try multiple models
+        model_options = [
+            "mistralai/Mistral-7B-Instruct-v0.1",
+            "google/flan-t5-large",
+            "microsoft/DialoGPT-large"
+        ]
+        for model_id in model_options:
+            try:
+                st.info(f"🔄 Trying LLM: {model_id}")
+                llm = HuggingFaceHub(
+                    repo_id=model_id,
+                    huggingfacehub_api_token=api_key,
+                    model_kwargs={
+                        "temperature": 0.7,
+                        "max_length": 512,
+                        "max_new_tokens": 256,
+                    }
+                )
+                # Test the model
+                test_response = llm.invoke("Hello")
+                if test_response and len(test_response.strip()) > 0:
+                    st.success(f"✅ Loaded LLM: {model_id.split('/')[-1]}")
+                    return llm
+            except Exception as e:
+                st.warning(f"⚠️ Failed to load {model_id}: {str(e)}")
+                continue
+        st.error("❌ All LLMs failed to load")
+        return None
     except Exception as e:
+        st.error(f"❌ LLM error: {e}")
         return None
def simple_chat_analysis(user_input: str, extracted_data: Dict) -> str:
    """Answer a chat question with keyword rules instead of an LLM.

    The question is matched (case-insensitively) against three keyword
    groups, checked in this order: summary, purpose, activity. Anything
    else gets a generic response with example questions.

    Args:
        user_input: The question typed by the user.
        extracted_data: Extraction result; the keys read here are
            ``page_info`` (``title``, ``description``), ``content_blocks``
            (each block's ``content_type``), ``url_type`` and ``source``.

    Returns:
        A Markdown-formatted answer. Never raises: any failure is reported
        as an ``"Analysis error: ..."`` string.
    """
    try:
        if not extracted_data:
            return "No data available for analysis."

        page_info = extracted_data.get('page_info') or {}
        content_blocks = extracted_data.get('content_blocks') or []
        url_type = extracted_data.get('url_type', 'Facebook Content')
        source = extracted_data.get('source', 'demo')
        query = user_input.lower()

        # A block without a content type counts as "" rather than aborting
        # the whole answer with a KeyError.
        content_types = [str(block.get('content_type', '')) for block in content_blocks]
        lowered_types = [content_type.lower() for content_type in content_types]
        total = len(content_blocks)

        def mentions(*keywords):
            """Return True when the question contains any of the keywords."""
            return any(keyword in query for keyword in keywords)

        def count_matching(*keywords):
            """Count blocks whose content type contains any of the keywords."""
            return sum(
                1 for content_type in lowered_types
                if any(keyword in content_type for keyword in keywords)
            )

        if mentions('summary', 'summarize', 'overview'):
            # Sorted so the answer is stable from run to run.
            distinct_types = sorted({t for t in content_types if t})
            lines = [
                f"**📊 Summary of {page_info.get('title', 'Facebook Content')}**",
                f"**Type:** {url_type}",
                f"**Data Source:** {source.upper()}",
                f"**Description:** {page_info.get('description', 'No description available')}",
                f"This appears to be a {url_type.lower()} with {total} content blocks of public information.",
                "**Key Content Types:**",
                ', '.join(distinct_types),
                "The content focuses on community engagement and social interactions.",
            ]
        elif mentions('purpose', 'about', 'what is'):
            lines = [
                "**🎯 Purpose Analysis**",
                f"Based on the extracted data, this {url_type.lower()} appears to be focused on:",
                f"- **Community Building:** {count_matching('community')} community-related posts",
                f"- **Information Sharing:** {count_matching('announcement')} announcements",
                f"- **Member Engagement:** {count_matching('post')} member posts",
                f"**Overall Purpose:** {page_info.get('description', 'Community engagement and content sharing')}",
            ]
        elif mentions('activity', 'engagement', 'active'):
            active_blocks = count_matching('post', 'question', 'event')
            lines = [
                "**📈 Activity Analysis**",
                "**Content Activity Level:**",
                f"- Total Content Blocks: {total}",
                f"- Active Engagement Posts: {active_blocks}",
                f"- Informational Posts: {total - active_blocks}",
                f"The {url_type.lower()} shows a good mix of member engagement and informational content, suggesting an active community.",
            ]
        else:
            lines = [
                "**🤖 Analysis Response**",
                f"I've analyzed the {url_type.lower()} data for you.",
                f'**Your question:** "{user_input}"',
                f"**Content Source:** {source.upper()} data",
                f"**Content Type:** {url_type}",
                f"This {url_type.lower()} contains {total} pieces of content focusing on community engagement and information sharing.",
                "**Try asking:**",
                '- "What is the main purpose of this group/page?"',
                '- "Summarize the content and activities"',
                '- "What kind of engagement does this content show?"',
            ]
        return "\n".join(lines)
    except Exception as e:
        # Chat must always receive a string, so failures are reported inline.
        return f"Analysis error: {str(e)}"
 def process_facebook_data(extracted_data):
+    """Process extracted data for AI analysis with fallbacks"""
     if not extracted_data or extracted_data.get("status") != "success":
         return None, []
     chunks = splitter.split_text(all_text)
     documents = [Document(page_content=chunk) for chunk in chunks]
+    return "simple", documents  # Return simple mode instead of vectorstore
 def create_chatbot(vectorstore):
     """Create conversational chatbot"""
     try:
         llm = get_llm()
         if llm is None:
+            return "simple"  # Return simple mode if LLM fails
         memory = ConversationBufferMemory(
             memory_key="chat_history",
         return chain
     except Exception as e:
         st.error(f"Chatbot creation failed: {str(e)}")
+        return "simple"  # Fallback to simple mode
 def main():
     st.title("📘 Facebook Data Extractor")
         st.session_state.chatbot = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []
+    if "processing_mode" not in st.session_state:
+        st.session_state.processing_mode = "ai"  # ai or simple
     # Sidebar
     with st.sidebar:
             help="Enter any Facebook URL for analysis"
         )
+        # Processing mode
+        st.subheader("🔧 Processing Mode")
+        processing_mode = st.radio(
+            "Choose analysis mode:",
+            ["AI Analysis (Recommended)", "Simple Analysis"],
+            help="AI Analysis uses embeddings, Simple uses rule-based"
+        )
+        st.session_state.processing_mode = "ai" if processing_mode == "AI Analysis (Recommended)" else "simple"
         # Quick test URLs
         st.markdown("### 🚀 Test URLs")
         test_urls = {
                     if extracted_data.get("status") == "success":
                         st.session_state.facebook_data = extracted_data
+                        # Process based on selected mode
+                        if st.session_state.processing_mode == "ai":
+                            result = process_facebook_data(extracted_data)
+                            if result and result[0] != "simple":
+                                st.session_state.vectorstore = result[0]
+                                st.session_state.chatbot = create_chatbot(result[0])
+                                st.session_state.chat_history = []
+                                st.success("✅ AI analysis ready!")
                             else:
+                                st.warning("⚠️ Using simple analysis (AI features limited)")
+                                st.session_state.chatbot = "simple"
+                                st.session_state.chat_history = []
+                        else:
+                            st.session_state.chatbot = "simple"
+                            st.session_state.chat_history = []
+                            st.success("✅ Simple analysis ready!")
+                        source = extracted_data.get('source', 'unknown')
+                        if source == 'demo':
+                            st.warning("📝 Using realistic demo data (Facebook restrictions active)")
                         else:
+                            st.success("✅ Real data extracted successfully!")
                     else:
                         error_msg = extracted_data.get("error", "Unknown error")
                         st.error(f"❌ Extraction failed: {error_msg}")
             else:
                 st.success("✅ **Real Data** - Successfully extracted")
+            # Show processing mode
+            if st.session_state.processing_mode == "simple":
+                st.info("🔧 **Simple Analysis Mode** - Rule-based processing")
+            else:
+                st.info("🤖 **AI Analysis Mode** - Embedding-based processing")
             # Metrics
             col1, col2, col3 = st.columns(3)
             with col1:
             with col2:
                 st.metric("Data Source", source.upper())
             with col3:
+                st.metric("Analysis Mode", "AI" if st.session_state.processing_mode == "ai" else "Simple")
             # Page info
             st.subheader("🏷️ Page Information")
             1. Enter any Facebook URL
             2. System tries real data extraction
             3. If blocked, uses **realistic demo data**
+            4. Choose between AI or Simple analysis
+            **Analysis Modes:**
+            - 🤖 **AI Analysis**: Uses embeddings and Mistral AI
+            - 🔧 **Simple Analysis**: Rule-based (works without embeddings)
             **Perfect for demonstrating:**
             - Social media data extraction concepts
             """)
     with col2:
+        st.header("💬 Analysis Chat")
         if st.session_state.chatbot and st.session_state.facebook_data:
             # Display chat history
             if user_input:
                 st.session_state.chat_history.append({"role": "user", "content": user_input})
+                with st.spinner("🤔 Analyzing..."):
                     try:
+                        if st.session_state.chatbot == "simple":
+                            # Use simple analysis
+                            response = simple_chat_analysis(user_input, st.session_state.facebook_data)
+                            st.session_state.chat_history.append({"role": "assistant", "content": response})
+                        else:
+                            # Use AI chatbot
+                            response = st.session_state.chatbot.invoke({"question": user_input})
+                            answer = response.get("answer", "I couldn't generate a response.")
+                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
                         st.rerun()
                     except Exception as e:
+                        error_msg = f"Analysis Error: {str(e)}"
                         st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
                         st.rerun()
                         st.info(f"Type: '{suggestion}' in chat")
         elif st.session_state.facebook_data:
+            st.info("💬 Start chatting about the Facebook data")
         else:
+            st.info("🔍 Extract Facebook data to enable analysis")
 if __name__ == "__main__":
     main()