Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed on Oct 21, 2025

Commit

8c65300

verified ·

1 Parent(s): e3795ec

Update pages/linkedin_extractor.py

Browse files

Files changed (1) hide show

pages/linkedin_extractor.py +425 -0

pages/linkedin_extractor.py CHANGED Viewed

	@@ -0,0 +1,425 @@

+# pages/linkedin_extractor.py
+import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+import re
+import time
+import os
# Page-level Streamlit settings: browser-tab title, tab icon, and wide layout.
st.set_page_config(page_title="LinkedIn AI Analyzer", page_icon="💼", layout="wide")
def enhanced_chat_analysis(user_input, extracted_data):
    """Return a keyword-driven markdown reply about previously extracted data.

    The reply is chosen by case-insensitive substring matching of a few
    keyword groups against ``user_input``; the first matching group wins and
    anything unmatched gets a generic reply.

    NOTE(review): apart from the title, data type, block count and the quoted
    content blocks, the reply text is static. The statements about GitHub
    projects and skills are hard-coded and are not derived from
    ``extracted_data``.

    Args:
        user_input: The question typed by the user.
        extracted_data: Dict as produced by the extraction step. The keys read
            here are ``content_blocks`` (list of strings), ``page_info``
            (dict with ``title``) and ``data_type``. May be ``None``/empty.

    Returns:
        A markdown string. This function always returns a string: missing
        data and unexpected errors are reported as "❌ ..." messages instead
        of being raised.
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        # ``or`` guards against keys that are present but set to None.
        content_blocks = extracted_data.get('content_blocks') or []
        page_info = extracted_data.get('page_info') or {}
        data_type = extracted_data.get('data_type', 'profile')

        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)
        user_input_lower = user_input.lower()

        def mentions(*keywords):
            """Return True if any keyword occurs as a substring of the question."""
            return any(keyword in user_input_lower for keyword in keywords)

        # The first two branches quote page content, so they are only taken
        # when content exists; otherwise the question falls through to the
        # later branches (ultimately the generic reply) instead of returning
        # None.
        if content_blocks and mentions('what is this', 'what\'s this', 'post about', 'content about'):
            main_content = content_blocks[0]
            return f"""**📝 Post Analysis:**
This LinkedIn post is about:
**{main_content}**
The author is sharing their GitHub profile and showcasing projects they've been working on, including:
• **University Information Chatbot** - An AI chatbot for university information
• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data
This appears to be a professional sharing their technical projects and inviting others to check out their work."""
        elif content_blocks and mentions('summary', 'summarize', 'overview'):
            # First 20 words of each of the first three blocks.
            main_points = [
                f"{number}. {' '.join(block.split()[:20])}..."
                for number, block in enumerate(content_blocks[:3], start=1)
            ]
            key_content = "\n".join(main_points)
            return f"""**📊 Summary**
**Title:** {title}
**Type:** {data_type.title()}
**Content Blocks:** {total_blocks}
**Key Content:**
{key_content}
The post showcases technical projects and professional work."""
        elif mentions('project', 'github', 'repository'):
            return """**🛠️ Projects Mentioned:**
Based on the LinkedIn post, the author is sharing these projects:
1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information
2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles
The author is inviting people to check out their GitHub profile to see these projects."""
        elif mentions('skill', 'technology', 'expertise'):
            return """**💻 Technical Skills Implied:**
Based on the projects mentioned, the author likely has skills in:
• Python programming
• Web development
• AI/Chatbot development
• Data extraction/processing
• API integration
• GitHub repository management
These skills are typical for building chatbots and data extraction tools."""
        elif mentions('who', 'author', 'person'):
            return f"""**👤 About the Author:**
Based on the LinkedIn post:
**Title:** {title}
This appears to be a professional developer/engineer who:
- Builds AI chatbots and data extraction tools
- Shares their work on GitHub
- Is active on LinkedIn for professional networking
- Works on projects like University Information systems and LinkedIn data analysis"""
        else:
            preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
            # The final quote is escaped: an unescaped `"` directly before the
            # closing triple quote would end the literal one character early
            # and leave a dangling quote (a SyntaxError).
            return f"""**🤖 Analysis Response:**
I've analyzed this LinkedIn post for you.
**Your question:** "{user_input}"
**Post Content:** {preview}
This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.
**Try asking:**
- "What projects are mentioned?"
- "Tell me about the GitHub profile"
- "What is the main purpose of this post?"
- "What skills does the author have?\""""
    except Exception as e:
        # UI boundary: report the problem in the chat instead of crashing the page.
        return f"❌ Analysis error: {str(e)}"
def extract_linkedin_data(url, data_type):
    """Download a page and return its visible text as a structured dict.

    The page is fetched with a browser-like User-Agent, selected non-content
    tags are removed, the remaining text is whitespace-normalised and then
    split on '.' into "content blocks" longer than 30 characters.

    Args:
        url: Address of the page to fetch.
        data_type: Label stored unchanged in the result under ``data_type``.

    Returns:
        On success, a dict with ``page_info`` (title, url, response_code,
        content_length), ``content_blocks``, ``extraction_time``,
        ``data_type`` and ``status == "success"``. On any failure, a dict
        with ``error`` and ``status == "error"``; exceptions are not raised.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }

    def failure(message):
        """Build the error-shaped result returned on every failure path."""
        return {"error": message, "status": "error"}

    try:
        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=request_headers, timeout=25)
        if response.status_code != 200:
            return failure(f"Failed to access page (Status: {response.status_code})")

        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop tags whose text is not page content.
        for tag in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
            tag.decompose()

        # Collapse the document text: strip each line, split it on double
        # spaces, and keep only the non-empty pieces.
        fragments = []
        for raw_line in soup.get_text().splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        clean_text = ' '.join(fragments)

        # A "content block" is any '.'-delimited span longer than 30 chars.
        paragraphs = []
        for sentence in clean_text.split('.'):
            sentence = sentence.strip()
            if len(sentence) > 30:
                paragraphs.append(sentence)

        if not paragraphs:
            return failure("No meaningful content found. The page might require login or have restricted access.")

        title_tag = soup.find('title')
        if title_tag:
            page_title = title_tag.text.strip()
        else:
            page_title = "LinkedIn Page"

        page_info = {
            "title": page_title,
            "url": url,
            "response_code": response.status_code,
            "content_length": len(clean_text),
        }
        return {
            "page_info": page_info,
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success",
        }
    except Exception as e:
        return failure(f"Extraction error: {str(e)}")
def display_metrics(extracted_data):
    """Render a four-column row of metrics for an extraction result.

    Shows the number of content blocks, the total whitespace-separated word
    count across blocks, the cleaned text length, and the HTTP response code.
    Does nothing when ``extracted_data`` is empty or ``None``.
    """
    if extracted_data:
        info = extracted_data['page_info']
        blocks = extracted_data['content_blocks']

        blocks_col, words_col, chars_col, code_col = st.columns(4)
        with blocks_col:
            st.metric("Content Blocks", len(blocks))
        with words_col:
            word_counts = [len(text.split()) for text in blocks]
            st.metric("Total Words", sum(word_counts))
        with chars_col:
            # Thousands separator for readability.
            st.metric("Characters", f"{info['content_length']:,}")
        with code_col:
            st.metric("Response Code", info['response_code'])
def _init_session_state():
    """Seed ``st.session_state`` with defaults for every key this page uses."""
    # Built per call so the mutable chat list is never shared between sessions.
    defaults = {
        "extracted_data": None,
        "chat_history": [],
        "processing": False,
        "current_url": "",
        "last_user_input": "",
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _render_sidebar():
    """Draw the sidebar: URL entry, quick-test buttons, extraction, chat reset."""
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")
        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }
        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )

        # Quick test URLs: remembered in session state and used by the
        # extract button when the text box is left empty.
        st.markdown("### 🚀 Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }
        for name, url in test_urls.items():
            if st.button(f"🏢 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()

        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            url_to_use = linkedin_url.strip() or st.session_state.current_url
            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                try:
                    with st.spinner("🔄 Extracting LinkedIn data..."):
                        extracted_data = extract_linkedin_data(url_to_use, data_type)
                        if extracted_data.get("status") == "success":
                            st.session_state.extracted_data = extracted_data
                            st.session_state.current_url = url_to_use
                            # New data invalidates the previous conversation.
                            st.session_state.chat_history = []
                            st.session_state.last_user_input = ""
                            st.success("✅ Data extracted successfully!")
                            st.balloons()
                        else:
                            error_msg = extracted_data.get("error", "Unknown error")
                            st.error(f"❌ Extraction failed: {error_msg}")
                finally:
                    # Always clear the flag so an unexpected error cannot
                    # leave the page stuck in the "processing" state.
                    st.session_state.processing = False

        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("🗑️ Chat history cleared!")


def _render_results():
    """Draw the extraction results, or the welcome text when nothing is loaded."""
    st.markdown("### 📊 Extraction Results")
    if st.session_state.processing:
        st.info("🔄 Processing LinkedIn data...")
    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']
        st.success("✅ Extraction Complete")
        display_metrics(data)

        col1, col2 = st.columns(2)
        with col1:
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extracted:** {data['extraction_time']}")
        with col2:
            st.markdown("#### 📝 Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                    st.write(block)
            if len(content_blocks) > 3:
                st.info(f"📄 +{len(content_blocks) - 3} more blocks")
    else:
        st.info("""
        👋 **Welcome to LinkedIn AI Analyzer!**
        **To get started:**
        1. Select content type in sidebar
        2. Enter a LinkedIn URL or click suggested company
        3. Click "Extract & Analyze"
        4. Chat with the AI below about the extracted content
        **Supported URLs:**
        - 👤 Public Profiles
        - 🏢 Company Pages
        - 📝 Public Posts
        """)


def _render_chat():
    """Draw the chat history, suggestions and input once data has been extracted."""
    st.markdown("---")
    st.markdown("### 💬 Chat with AI")
    data = st.session_state.extracted_data
    if not (data and data.get("status") == "success"):
        return

    st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")
    for entry in st.session_state.chat_history:
        role = entry["role"]
        if role in ("user", "assistant"):
            with st.chat_message(role):
                st.write(entry['content'])

    # Suggested questions are shown only before the first message.
    if not st.session_state.chat_history:
        st.markdown("#### 💡 Try asking:")
        suggestions = [
            "What is this post about?",
            "Summarize this content",
            "What projects are mentioned?",
            "Tell me about the GitHub profile"
        ]
        cols = st.columns(len(suggestions))
        for i, suggestion in enumerate(suggestions):
            with cols[i]:
                if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                    st.info(f"💡 Type: '{suggestion}' in the chat below")

    user_input = st.chat_input("Type your question about the LinkedIn data here...")
    # st.chat_input yields a value only on the run in which it was submitted,
    # so no comparison with the previous message is needed. Such a comparison
    # would silently ignore a user who asks the same question twice in a row.
    if user_input:
        st.session_state.last_user_input = user_input
        st.session_state.chat_history.append({"role": "user", "content": user_input})
        with st.spinner("🤔 Analyzing..."):
            response = enhanced_chat_analysis(user_input, data)
            st.session_state.chat_history.append({"role": "assistant", "content": response})
        # Rerun so the new messages are drawn by the history loop above.
        st.rerun()


def _render_features():
    """Draw the static three-column feature overview at the bottom of the page."""
    st.markdown("---")
    st.markdown("### 🚀 Features")
    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
        **📊 Data Extraction**
        - LinkedIn content scraping
        - Text processing
        - Content analysis
        """)
    with feature_cols[1]:
        st.markdown("""
        **💬 Smart Chat**
        - Interactive Q&A
        - Content analysis
        - Professional insights
        """)
    with feature_cols[2]:
        st.markdown("""
        **🔍 Insights**
        - Summary generation
        - Skill detection
        - Experience analysis
        """)


def main():
    """Render the LinkedIn analyzer page.

    Draws, in order: the title, the sidebar (URL input and extraction), the
    extraction results, the chat section, and the feature overview. All
    state that must survive Streamlit reruns lives in ``st.session_state``.
    """
    st.title("💼 LinkedIn AI Analyzer")
    _init_session_state()
    _render_sidebar()
    _render_results()
    _render_chat()
    _render_features()
# Render the page when this module is executed as a script.
if __name__ == "__main__":
    main()