Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

File size: 17,200 Bytes

# pages/linkedin_extractor.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import time
import os
# Add to TOP of each extractor file
import streamlit as st

# ============================================
# AUTHENTICATION CHECK
# ============================================
# Gate the whole page on the "authenticated" flag in session state. When the
# flag is missing or falsy, an "Access Denied" page is rendered and the script
# is stopped, so none of the code below this block runs for that session.
if "authenticated" not in st.session_state or not st.session_state.authenticated:
    # Page config for the denied view; the normal config further down is only
    # reached by authenticated sessions.
    st.set_page_config(page_title="Access Denied", page_icon="🔒", layout="centered")
    
    # Red banner telling the visitor that a login is required.
    st.markdown("""
    <style>
        .error-container {
            text-align: center;
            padding: 3rem;
            background: linear-gradient(135deg, #ef4444, #dc2626);
            color: white;
            border-radius: 10px;
            margin: 2rem 0;
        }
    </style>
    
    <div class="error-container">
        <h1>🔐 Access Denied</h1>
        <p style="font-size: 1.2rem;">Please login to access this page</p>
    </div>
    """, unsafe_allow_html=True)
    
    # Plain HTML link (styled as a button) pointing at the app root "/".
    st.markdown("""
    <div style="text-align: center; margin-top: 2rem;">
        <a href="/">
            <button style="
                background-color: #4285F4;
                color: white;
                padding: 12px 24px;
                border-radius: 6px;
                border: none;
                font-size: 16px;
                cursor: pointer;
            ">
                🔐 Go to Login Page
            </button>
        </a>
    </div>
    """, unsafe_allow_html=True)
    
    # Halt here so the analyzer UI is never rendered for this session.
    st.stop()

# Page config for the authenticated analyzer view.
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="💼",
    layout="wide"
)

def enhanced_chat_analysis(user_input, extracted_data):
    """Return a canned Markdown reply to a question about extracted data.

    The reply is chosen by case-insensitive substring matching of
    ``user_input`` against keyword groups, tried in this order: "what is
    this"-style questions, summary, projects, skills, author. Anything else
    gets a generic fallback that echoes the question and a content preview.

    NOTE(review): apart from the title, data type, block count and block
    excerpts, the reply text is hard-coded (GitHub profile, "University
    Information Chatbot", "LinkedIn Data Extractor", the skill list) and is
    not derived from ``extracted_data``.

    Args:
        user_input: The question typed by the user.
        extracted_data: Mapping with optional keys ``content_blocks`` (list
            of text blocks), ``page_info`` (mapping with ``title``) and
            ``data_type``. May be ``None`` or empty.

    Returns:
        A Markdown string. A string is returned on every path, including
        when there are no content blocks or when an unexpected error occurs
        (the error text is embedded in the reply instead of being raised).
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        # `or` also covers keys that are present but set to None.
        content_blocks = extracted_data.get('content_blocks') or []
        page_info = extracted_data.get('page_info') or {}
        data_type = extracted_data.get('data_type', 'profile')

        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)

        question = user_input.lower()

        def mentions(*keywords):
            """Return True when any keyword occurs in the lowered question."""
            return any(keyword in question for keyword in keywords)

        # Reply for content-dependent questions when nothing was extracted;
        # previously these branches fell through and returned None.
        no_content_reply = (
            "⚠️ The extracted data contains no content blocks to analyze. "
            "Please extract the page again or try a different public URL."
        )

        if mentions('what is this', "what's this", 'post about', 'content about'):
            if not content_blocks:
                return no_content_reply
            main_content = content_blocks[0]
            return f"""**📝 Post Analysis:**

This LinkedIn post is about:

**{main_content}**

The author is sharing their GitHub profile and showcasing projects they've been working on, including:

• **University Information Chatbot** - An AI chatbot for university information
• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data

This appears to be a professional sharing their technical projects and inviting others to check out their work."""

        if mentions('summary', 'summarize', 'overview'):
            if not content_blocks:
                return no_content_reply
            # First 20 words of each of the first three blocks.
            main_points = [
                f"{position}. {' '.join(block.split()[:20])}..."
                for position, block in enumerate(content_blocks[:3], start=1)
            ]
            key_content = "\n".join(main_points)
            return f"""**📊 Summary**

**Title:** {title}
**Type:** {data_type.title()}
**Content Blocks:** {total_blocks}

**Key Content:**
{key_content}

The post showcases technical projects and professional work."""

        if mentions('project', 'github', 'repository'):
            return """**🛠️ Projects Mentioned:**

Based on the LinkedIn post, the author is sharing these projects:

1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information
2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles

The author is inviting people to check out their GitHub profile to see these projects."""

        if mentions('skill', 'technology', 'expertise'):
            return """**💻 Technical Skills Implied:**

Based on the projects mentioned, the author likely has skills in:

• Python programming
• Web development
• AI/Chatbot development
• Data extraction/processing
• API integration
• GitHub repository management

These skills are typical for building chatbots and data extraction tools."""

        if mentions('who', 'author', 'person'):
            return f"""**👤 About the Author:**

Based on the LinkedIn post:

**Title:** {title}

This appears to be a professional developer/engineer who:
- Builds AI chatbots and data extraction tools
- Shares their work on GitHub
- Is active on LinkedIn for professional networking
- Works on projects like University Information systems and LinkedIn data analysis"""

        # Generic fallback: echo the question with a 200-character preview.
        post_preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
        response_lines = [
            "**🤖 Analysis Response:**",
            "",
            "I've analyzed this LinkedIn post for you.",
            "",
            f"**Your question:** \"{user_input}\"",
            "",
            f"**Post Content:** {post_preview}",
            "",
            "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
            "",
            "**Try asking:**",
            "- \"What projects are mentioned?\"",
            "- \"Tell me about the GitHub profile\"",
            "- \"What is the main purpose of this post?\"",
            "- \"What skills does the author have?\"",
        ]
        return "\n".join(response_lines)

    except Exception as e:
        # UI boundary: the caller renders the return value directly, so
        # surface the failure as text rather than raising.
        return f"❌ Analysis error: {str(e)}"

def extract_linkedin_data(url, data_type):
    """Download a LinkedIn page and split its visible text into blocks.

    The page is fetched with browser-like headers, non-content tags are
    removed, and the remaining text is whitespace-normalised and split on
    periods; fragments longer than 30 characters become content blocks.

    Args:
        url: Address of the page to fetch.
        data_type: Label stored unchanged in the result under ``data_type``.

    Returns:
        On success, a dict with ``page_info`` (title, url, response_code,
        content_length), ``content_blocks``, ``extraction_time``,
        ``data_type`` and ``status`` set to ``"success"``. On any failure
        (non-200 response, no usable text, or an exception), a dict with an
        ``error`` message and ``status`` set to ``"error"``.
    """
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=request_headers, timeout=25)

        if response.status_code != 200:
            failure_message = f"Failed to access page (Status: {response.status_code})"
            return {"error": failure_message, "status": "error"}

        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip elements that carry no readable page content.
        non_content_tags = ["script", "style", "meta", "link", "nav", "header", "footer"]
        for element in soup.find_all(non_content_tags):
            element.decompose()

        # Normalise whitespace: trim each line, break on double spaces,
        # drop empty pieces, then join everything with single spaces.
        fragments = []
        for raw_line in soup.get_text().splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        clean_text = ' '.join(fragments)

        # Keep only period-delimited fragments longer than 30 characters.
        paragraphs = []
        for sentence in clean_text.split('.'):
            sentence = sentence.strip()
            if len(sentence) > 30:
                paragraphs.append(sentence)

        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title_tag = soup.find('title')
        if title_tag:
            page_title = title_tag.text.strip()
        else:
            page_title = "LinkedIn Page"

        page_info = {
            "title": page_title,
            "url": url,
            "response_code": response.status_code,
            "content_length": len(clean_text)
        }

        return {
            "page_info": page_info,
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }

    except Exception as e:
        return {"error": f"Extraction error: {str(e)}", "status": "error"}

def display_metrics(extracted_data):
    """Render four summary metrics for an extraction result in one row.

    Shows the number of content blocks, the total word count across blocks,
    the cleaned text length and the HTTP response code. Does nothing when
    ``extracted_data`` is empty or ``None``.
    """
    if not extracted_data:
        return

    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']

    # Label plus a lazily evaluated value, so each value is computed only
    # once its column is active.
    metric_specs = (
        ("Content Blocks", lambda: len(blocks)),
        ("Total Words", lambda: sum(len(text.split()) for text in blocks)),
        ("Characters", lambda: f"{info['content_length']:,}"),
        ("Response Code", lambda: info['response_code']),
    )

    first, second, third, fourth = st.columns(4)
    for column, (label, compute) in zip((first, second, third, fourth), metric_specs):
        with column:
            st.metric(label, compute())

def main():
    """Render the LinkedIn AI Analyzer page.

    Lays out, in order: the sidebar (content type, URL input, quick-test
    company buttons, extract button, chat management), the extraction
    results, the chat section and a static feature list.

    State is kept in ``st.session_state`` under the keys ``extracted_data``,
    ``chat_history``, ``processing``, ``current_url`` and
    ``last_user_input``.
    """
    st.title("💼 LinkedIn AI Analyzer")

    # Seed session state; keys that already exist are left untouched.
    session_defaults = {
        "extracted_data": None,
        "chat_history": [],
        "processing": False,
        "current_url": "",
        "last_user_input": "",
    }
    for state_key, default_value in session_defaults.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = default_value

    # Sidebar
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")

        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])

        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }

        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )

        # Quick test URLs: a click stores the URL as the current one; it is
        # used by the extract button when the text input is empty.
        st.markdown("### 🚀 Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }

        for name, url in test_urls.items():
            if st.button(f"🏢 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()

        # Extract button
        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            # A typed URL takes precedence over a quick-test selection.
            url_to_use = linkedin_url.strip() or st.session_state.current_url

            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                try:
                    with st.spinner("🔄 Extracting LinkedIn data..."):
                        extracted_data = extract_linkedin_data(url_to_use, data_type)

                        if extracted_data.get("status") == "success":
                            # New data invalidates the previous conversation.
                            st.session_state.extracted_data = extracted_data
                            st.session_state.current_url = url_to_use
                            st.session_state.chat_history = []
                            st.session_state.last_user_input = ""
                            st.success("✅ Data extracted successfully!")
                            st.balloons()
                        else:
                            error_msg = extracted_data.get("error", "Unknown error")
                            st.error(f"❌ Extraction failed: {error_msg}")
                finally:
                    # Always clear the flag so an unexpected error cannot
                    # leave the page stuck on the "processing" notice.
                    st.session_state.processing = False

        # Chat management
        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("🗑️ Chat history cleared!")

    # Main content area
    st.markdown("### 📊 Extraction Results")

    if st.session_state.processing:
        st.info("🔄 Processing LinkedIn data...")

    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']

        st.success("✅ Extraction Complete")

        # Display metrics
        display_metrics(data)

        # Display page info and sample content in columns
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extracted:** {data['extraction_time']}")

        with col2:
            st.markdown("#### 📝 Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                    st.write(block)

            if len(content_blocks) > 3:
                st.info(f"📄 +{len(content_blocks) - 3} more blocks")

    else:
        st.info("""
        👋 **Welcome to LinkedIn AI Analyzer!**
        
        **To get started:**
        1. Select content type in sidebar
        2. Enter a LinkedIn URL or click suggested company
        3. Click "Extract & Analyze" 
        4. Chat with the AI below about the extracted content
        
        **Supported URLs:**
        - 👤 Public Profiles
        - 🏢 Company Pages  
        - 📝 Public Posts
        """)

    # Chat section
    st.markdown("---")
    st.markdown("### 💬 Chat with AI")

    has_data = st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success"

    if has_data:
        st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")

        # Display chat history; entries with any other role are skipped.
        for chat in st.session_state.chat_history:
            role = chat["role"]
            if role in ("user", "assistant"):
                with st.chat_message(role):
                    st.write(chat['content'])

        # Suggested questions when no history
        if not st.session_state.chat_history:
            st.markdown("#### 💡 Try asking:")
            suggestions = [
                "What is this post about?",
                "Summarize this content",
                "What projects are mentioned?",
                "Tell me about the GitHub profile"
            ]

            cols = st.columns(len(suggestions))
            for i, suggestion in enumerate(suggestions):
                with cols[i]:
                    if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                        st.info(f"💡 Type: '{suggestion}' in the chat below")

        # CHAT INPUT
        user_input = st.chat_input("Type your question about the LinkedIn data here...")

        # chat_input yields text only on the run triggered by a submission,
        # so every non-empty value is a new message. Comparing it with the
        # previous question would silently drop a repeated question.
        if user_input:
            st.session_state.last_user_input = user_input
            st.session_state.chat_history.append({"role": "user", "content": user_input})

            with st.spinner("🤔 Analyzing..."):
                response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
                st.session_state.chat_history.append({"role": "assistant", "content": response})

            # Rerun so the new messages are drawn by the history loop above.
            st.rerun()

    # Features section
    st.markdown("---")
    st.markdown("### 🚀 Features")

    feature_cols = st.columns(3)

    with feature_cols[0]:
        st.markdown("""
        **📊 Data Extraction**
        - LinkedIn content scraping
        - Text processing
        - Content analysis
        """)

    with feature_cols[1]:
        st.markdown("""
        **💬 Smart Chat**
        - Interactive Q&A
        - Content analysis
        - Professional insights
        """)

    with feature_cols[2]:
        st.markdown("""
        **🔍 Insights**
        - Summary generation
        - Skill detection
        - Experience analysis
        """)

# Render the page when this file is run as the main script.
if __name__ == "__main__":
    main()