Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed on Oct 21, 2025

Commit

e3795ec

verified ·

1 Parent(s): 095b424

Update pages/linkedin_extractor.py

Browse files

Files changed (1) hide show

pages/linkedin_extractor.py +0 -425

pages/linkedin_extractor.py CHANGED Viewed

@@ -1,425 +0,0 @@
-# pages/linkedin_extractor.py
-import streamlit as st
-import requests
-from bs4 import BeautifulSoup
-import re
-import time
-import os
-# Configure the browser tab title, the page icon and the wide layout for this page.
-st.set_page_config(
-    page_title="LinkedIn AI Analyzer",
-    page_icon="💼",
-    layout="wide"
-)
-def enhanced_chat_analysis(user_input, extracted_data):
-    """Answer a chat question about previously extracted LinkedIn data.
-    The question is lower-cased and matched by substring against fixed keyword
-    groups, checked in order; the first group that matches selects a Markdown
-    response template. Questions that match no group get a generic response.
-    Args:
-        user_input: The question typed by the user. ``None`` is treated as an
-            empty question.
-        extracted_data: Mapping that may contain ``content_blocks`` (list of
-            text blocks), ``page_info`` (mapping with a ``title``) and
-            ``data_type``. A falsy value means nothing has been extracted yet.
-    Returns:
-        A Markdown string. The function always returns a string: missing data
-        and unexpected failures are reported as messages starting with "❌".
-    """
-    # NOTE(review): several templates below describe one specific post (GitHub
-    # profile, University Information Chatbot, ...) regardless of what was
-    # actually extracted. The wording is kept verbatim; confirm whether these
-    # answers should be derived from ``content_blocks`` instead.
-    try:
-        if not extracted_data:
-            return "❌ No LinkedIn data available. Please extract data first using the sidebar."
-        content_blocks = extracted_data.get('content_blocks') or []
-        page_info = extracted_data.get('page_info') or {}
-        data_type = extracted_data.get('data_type') or 'profile'
-        title = page_info.get('title', 'LinkedIn Content')
-        total_blocks = len(content_blocks)
-        user_input_lower = (user_input or "").lower()
-        def mentions(*keywords):
-            """Return True when any keyword occurs as a substring of the question."""
-            return any(keyword in user_input_lower for keyword in keywords)
-        if mentions('what is this', 'what\'s this', 'post about', 'content about'):
-            # Fall back to a placeholder instead of returning None when
-            # there are no content blocks.
-            main_content = content_blocks[0] if content_blocks else "No content available"
-            return f"""**📝 Post Analysis:**
-This LinkedIn post is about:
-**{main_content}**
-The author is sharing their GitHub profile and showcasing projects they've been working on, including:
-• **University Information Chatbot** - An AI chatbot for university information
-• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data
-This appears to be a professional sharing their technical projects and inviting others to check out their work."""
-        if mentions('summary', 'summarize', 'overview'):
-            # First 20 words of each of the first three blocks.
-            main_points = [
-                f"{index}. {' '.join(block.split()[:20])}..."
-                for index, block in enumerate(content_blocks[:3], start=1)
-            ]
-            key_content = chr(10).join(main_points) if main_points else "No content available"
-            return f"""**📊 Summary**
-**Title:** {title}
-**Type:** {data_type.title()}
-**Content Blocks:** {total_blocks}
-**Key Content:**
-{key_content}
-The post showcases technical projects and professional work."""
-        if mentions('project', 'github', 'repository'):
-            return """**🛠️ Projects Mentioned:**
-Based on the LinkedIn post, the author is sharing these projects:
-1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information
-2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles
-The author is inviting people to check out their GitHub profile to see these projects."""
-        if mentions('skill', 'technology', 'expertise'):
-            return """**💻 Technical Skills Implied:**
-Based on the projects mentioned, the author likely has skills in:
-• Python programming
-• Web development
-• AI/Chatbot development
-• Data extraction/processing
-• API integration
-• GitHub repository management
-These skills are typical for building chatbots and data extraction tools."""
-        if mentions('who', 'author', 'person'):
-            return f"""**👤 About the Author:**
-Based on the LinkedIn post:
-**Title:** {title}
-This appears to be a professional developer/engineer who:
-- Builds AI chatbots and data extraction tools
-- Shares their work on GitHub
-- Is active on LinkedIn for professional networking
-- Works on projects like University Information systems and LinkedIn data analysis"""
-        preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
-        # The final quote of the text is escaped: an unescaped `"` directly
-        # before the closing triple quote would end the literal one character
-        # early and leave an unterminated string.
-        return f"""**🤖 Analysis Response:**
-I've analyzed this LinkedIn post for you.
-**Your question:** "{user_input}"
-**Post Content:** {preview}
-This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.
-**Try asking:**
-- "What projects are mentioned?"
-- "Tell me about the GitHub profile"
-- "What is the main purpose of this post?"
-- "What skills does the author have?\""""
-    except Exception as e:
-        # Chat boundary: report the failure in the conversation rather than
-        # letting it abort the page.
-        return f"❌ Analysis error: {str(e)}"
-def extract_linkedin_data(url, data_type):
-    """Download a LinkedIn page and reduce it to a list of text blocks.
-    Args:
-        url: Address of the page to fetch.
-        data_type: Label stored unchanged in the result under ``data_type``.
-    Returns:
-        On success, a dict with ``page_info`` (title, url, response code and
-        cleaned-text length), ``content_blocks``, ``extraction_time``,
-        ``data_type`` and ``status == "success"``. On any failure, a dict with
-        an ``error`` message and ``status == "error"``.
-    """
-    request_headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-    }
-    try:
-        st.info(f"🌐 Accessing: {url}")
-        response = requests.get(url, headers=request_headers, timeout=25)
-        if response.status_code != 200:
-            return {
-                "error": f"Failed to access page (Status: {response.status_code})",
-                "status": "error"
-            }
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # Drop non-content elements before reading the text.
-        for tag in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
-            tag.decompose()
-        # Collapse the page text: strip each line, split it on double spaces,
-        # and keep only the non-empty pieces.
-        fragments = []
-        for raw_line in soup.get_text().splitlines():
-            for piece in raw_line.strip().split("  "):
-                piece = piece.strip()
-                if piece:
-                    fragments.append(piece)
-        clean_text = ' '.join(fragments)
-        # A "block" is a full-stop-delimited span longer than 30 characters.
-        paragraphs = []
-        for sentence in clean_text.split('.'):
-            sentence = sentence.strip()
-            if len(sentence) > 30:
-                paragraphs.append(sentence)
-        if not paragraphs:
-            return {
-                "error": "No meaningful content found. The page might require login or have restricted access.",
-                "status": "error"
-            }
-        title_tag = soup.find('title')
-        if title_tag:
-            page_title = title_tag.text.strip()
-        else:
-            page_title = "LinkedIn Page"
-        return {
-            "page_info": {
-                "title": page_title,
-                "url": url,
-                "response_code": response.status_code,
-                "content_length": len(clean_text)
-            },
-            "content_blocks": paragraphs,
-            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
-            "data_type": data_type,
-            "status": "success"
-        }
-    except Exception as exc:
-        return {"error": f"Extraction error: {str(exc)}", "status": "error"}
-def display_metrics(extracted_data):
-    """Render four summary metrics for an extraction result, side by side.
-    Shows the number of content blocks, the total word count across blocks,
-    the cleaned-text length and the HTTP response code. Does nothing when
-    ``extracted_data`` is falsy.
-    """
-    if not extracted_data:
-        return
-    page_info = extracted_data['page_info']
-    content_blocks = extracted_data['content_blocks']
-    blocks_col, words_col, chars_col, code_col = st.columns(4)
-    blocks_col.metric("Content Blocks", len(content_blocks))
-    words_col.metric("Total Words", sum(len(block.split()) for block in content_blocks))
-    chars_col.metric("Characters", f"{page_info['content_length']:,}")
-    code_col.metric("Response Code", page_info['response_code'])
-def _init_session_state():
-    """Create every session-state key this page reads, leaving existing values untouched."""
-    defaults = {
-        "extracted_data": None,
-        "chat_history": [],
-        "processing": False,
-        "current_url": "",
-        "last_user_input": "",
-    }
-    for key, default in defaults.items():
-        if key not in st.session_state:
-            st.session_state[key] = default
-def _run_extraction(url, data_type):
-    """Validate ``url``, run the extraction and store the outcome in session state."""
-    if not url:
-        st.warning("⚠️ Please enter a LinkedIn URL")
-        return
-    if not url.startswith('https://www.linkedin.com/'):
-        st.error("❌ Please enter a valid LinkedIn URL")
-        return
-    st.session_state.processing = True
-    try:
-        with st.spinner("🔄 Extracting LinkedIn data..."):
-            extracted_data = extract_linkedin_data(url, data_type)
-            if extracted_data.get("status") == "success":
-                st.session_state.extracted_data = extracted_data
-                st.session_state.current_url = url
-                # A new page invalidates the previous conversation.
-                st.session_state.chat_history = []
-                st.session_state.last_user_input = ""
-                st.success("✅ Data extracted successfully!")
-                st.balloons()
-            else:
-                error_msg = extracted_data.get("error", "Unknown error")
-                st.error(f"❌ Extraction failed: {error_msg}")
-    finally:
-        # Reset even when something above raises, so later reruns are not
-        # stuck showing the "processing" notice.
-        st.session_state.processing = False
-def _render_sidebar():
-    """Render the sidebar: URL input, quick-test buttons, extract button and chat reset."""
-    with st.sidebar:
-        st.markdown("### ⚙️ Configuration")
-        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
-        url_placeholder = {
-            "profile": "https://www.linkedin.com/in/username/",
-            "company": "https://www.linkedin.com/company/companyname/",
-            "post": "https://www.linkedin.com/posts/username_postid/"
-        }
-        linkedin_url = st.text_input(
-            "🌐 LinkedIn URL",
-            placeholder=url_placeholder[data_type],
-            help="Enter a public LinkedIn URL"
-        )
-        st.markdown("### 🚀 Quick Test")
-        test_urls = {
-            "Microsoft": "https://www.linkedin.com/company/microsoft/",
-            "Google": "https://www.linkedin.com/company/google/",
-            "Apple": "https://www.linkedin.com/company/apple/",
-        }
-        for name, url in test_urls.items():
-            if st.button(f"🏢 {name}", key=name, use_container_width=True):
-                # Remembered as the fallback URL used when the text box is empty.
-                st.session_state.current_url = url
-                st.rerun()
-        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
-            _run_extraction(linkedin_url.strip() or st.session_state.current_url, data_type)
-        if st.session_state.extracted_data:
-            st.markdown("---")
-            st.subheader("💬 Chat Management")
-            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
-                st.session_state.chat_history = []
-                st.session_state.last_user_input = ""
-                st.success("🗑️ Chat history cleared!")
-def _render_results():
-    """Render the extraction results, or the welcome text when nothing is extracted yet."""
-    st.markdown("### 📊 Extraction Results")
-    if st.session_state.processing:
-        st.info("🔄 Processing LinkedIn data...")
-    elif st.session_state.extracted_data:
-        data = st.session_state.extracted_data
-        page_info = data['page_info']
-        content_blocks = data['content_blocks']
-        st.success("✅ Extraction Complete")
-        display_metrics(data)
-        info_col, sample_col = st.columns(2)
-        with info_col:
-            st.markdown("#### 🏷️ Page Information")
-            st.write(f"**Title:** {page_info['title']}")
-            st.write(f"**URL:** {page_info['url']}")
-            st.write(f"**Type:** {data['data_type'].title()}")
-            st.write(f"**Content Blocks:** {len(content_blocks)}")
-            st.write(f"**Extracted:** {data['extraction_time']}")
-        with sample_col:
-            st.markdown("#### 📝 Sample Content")
-            for i, block in enumerate(content_blocks[:3]):
-                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
-                    st.write(block)
-            if len(content_blocks) > 3:
-                st.info(f"📄 +{len(content_blocks) - 3} more blocks")
-    else:
-        st.info("""
-        👋 **Welcome to LinkedIn AI Analyzer!**
-        **To get started:**
-        1. Select content type in sidebar
-        2. Enter a LinkedIn URL or click suggested company
-        3. Click "Extract & Analyze"
-        4. Chat with the AI below about the extracted content
-        **Supported URLs:**
-        - 👤 Public Profiles
-        - 🏢 Company Pages
-        - 📝 Public Posts
-        """)
-def _render_chat():
-    """Render the chat history, suggested questions and chat input once data is available."""
-    st.markdown("---")
-    st.markdown("### 💬 Chat with AI")
-    extracted = st.session_state.extracted_data
-    if not (extracted and extracted.get("status") == "success"):
-        return
-    st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")
-    for chat in st.session_state.chat_history:
-        if chat["role"] in ("user", "assistant"):
-            with st.chat_message(chat["role"]):
-                st.write(chat['content'])
-    if not st.session_state.chat_history:
-        st.markdown("#### 💡 Try asking:")
-        suggestions = [
-            "What is this post about?",
-            "Summarize this content",
-            "What projects are mentioned?",
-            "Tell me about the GitHub profile"
-        ]
-        for i, (column, suggestion) in enumerate(zip(st.columns(len(suggestions)), suggestions)):
-            with column:
-                if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
-                    st.info(f"💡 Type: '{suggestion}' in the chat below")
-    user_input = st.chat_input("Type your question about the LinkedIn data here...")
-    # The submitted question is processed unconditionally. Comparing it with
-    # the previous question, as the earlier code did, silently dropped a
-    # question asked twice in a row.
-    if user_input:
-        # Still recorded so the session-state key keeps its meaning.
-        st.session_state.last_user_input = user_input
-        st.session_state.chat_history.append({"role": "user", "content": user_input})
-        with st.spinner("🤔 Analyzing..."):
-            response = enhanced_chat_analysis(user_input, extracted)
-            st.session_state.chat_history.append({"role": "assistant", "content": response})
-        # Rerun so the new messages are drawn by the history loop above.
-        st.rerun()
-def _render_features():
-    """Render the static three-column feature overview at the bottom of the page."""
-    st.markdown("---")
-    st.markdown("### 🚀 Features")
-    feature_texts = (
-        """
-        **📊 Data Extraction**
-        - LinkedIn content scraping
-        - Text processing
-        - Content analysis
-        """,
-        """
-        **💬 Smart Chat**
-        - Interactive Q&A
-        - Content analysis
-        - Professional insights
-        """,
-        """
-        **🔍 Insights**
-        - Summary generation
-        - Skill detection
-        - Experience analysis
-        """,
-    )
-    for column, text in zip(st.columns(3), feature_texts):
-        with column:
-            st.markdown(text)
-def main():
-    """Render the LinkedIn AI Analyzer page.
-    Draws, in order: the title, the sidebar controls, the extraction results,
-    the chat section and the feature overview. All state shared between
-    reruns lives in ``st.session_state``.
-    """
-    st.title("💼 LinkedIn AI Analyzer")
-    _init_session_state()
-    _render_sidebar()
-    _render_results()
-    _render_chat()
-    _render_features()
-# Render the page when this file is executed as the main script.
-if __name__ == "__main__":
-    main()