# pages/linkedin_extractor.py
"""Streamlit page: extract public LinkedIn content and chat about it with
a lightweight keyword-based analysis assistant."""

import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import time
import os

# ============================================
# AUTHENTICATION CHECK
# ============================================
# Gate the page: everything below this block only runs for logged-in users.
# st.stop() aborts the script run, so the second set_page_config below is
# never reached on the unauthenticated path (Streamlit allows only one call).
if "authenticated" not in st.session_state or not st.session_state.authenticated:
    st.set_page_config(page_title="Access Denied", page_icon="🔒", layout="centered")
    st.markdown("""

🔐 Access Denied

Please login to access this page

""", unsafe_allow_html=True)
    st.markdown("""
""", unsafe_allow_html=True)
    st.stop()

st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="💼",
    layout="wide"
)


def enhanced_chat_analysis(user_input, extracted_data):
    """Produce a canned markdown answer for a user question about extracted data.

    Intent is detected by simple keyword matching on the lowercased question.

    Args:
        user_input: Free-form question typed into the chat input.
        extracted_data: Dict produced by extract_linkedin_data(), or None/empty
            when nothing has been extracted yet.

    Returns:
        A markdown-formatted response string (never None).
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        content_blocks = extracted_data.get('content_blocks', [])
        page_info = extracted_data.get('page_info', {})
        data_type = extracted_data.get('data_type', 'profile')

        # Get basic info
        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)
        user_input_lower = user_input.lower()

        # Enhanced response patterns — keyword-based intent routing
        if any(word in user_input_lower for word in ['what is this', 'what\'s this', 'post about', 'content about']):
            if content_blocks:
                # Get the actual content from the post
                main_content = content_blocks[0]
                return f"""**📝 Post Analysis:**

This LinkedIn post is about: **{main_content}**

The author is sharing their GitHub profile and showcasing projects they've been working on, including:

• **University Information Chatbot** - An AI chatbot for university information
• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data

This appears to be a professional sharing their technical projects and inviting others to check out their work."""
            # FIX: previously fell through and returned None when no blocks existed.
            return "❌ No content blocks were extracted, so I can't describe this post."

        elif any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
            if content_blocks:
                main_points = []
                for i, block in enumerate(content_blocks[:3]):
                    words = block.split()[:20]
                    main_points.append(f"{i+1}. {' '.join(words)}...")
                return f"""**📊 Summary**

**Title:** {title}
**Type:** {data_type.title()}
**Content Blocks:** {total_blocks}

**Key Content:**
{chr(10).join(main_points)}

The post showcases technical projects and professional work."""
            # FIX: previously fell through and returned None when no blocks existed.
            return "❌ No content blocks were extracted, so there is nothing to summarize."

        elif any(word in user_input_lower for word in ['project', 'github', 'repository']):
            return """**🛠️ Projects Mentioned:**

Based on the LinkedIn post, the author is sharing these projects:

1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information
2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles

The author is inviting people to check out their GitHub profile to see these projects."""

        elif any(word in user_input_lower for word in ['skill', 'technology', 'expertise']):
            return """**💻 Technical Skills Implied:**

Based on the projects mentioned, the author likely has skills in:

• Python programming
• Web development
• AI/Chatbot development
• Data extraction/processing
• API integration
• GitHub repository management

These skills are typical for building chatbots and data extraction tools."""

        elif any(word in user_input_lower for word in ['who', 'author', 'person']):
            return f"""**👤 About the Author:**

Based on the LinkedIn post:

**Title:** {title}

This appears to be a professional developer/engineer who:
- Builds AI chatbots and data extraction tools
- Shares their work on GitHub
- Is active on LinkedIn for professional networking
- Works on projects like University Information systems and LinkedIn data analysis"""

        else:
            # Generic fallback: echo the question plus a short content preview.
            post_preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
            response_lines = [
                "**🤖 Analysis Response:**",
                "",
                "I've analyzed this LinkedIn post for you.",
                "",
                f"**Your question:** \"{user_input}\"",
                "",
                f"**Post Content:** {post_preview}",
                "",
                "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
                "",
                "**Try asking:**",
                "- \"What projects are mentioned?\"",
                "- \"Tell me about the GitHub profile\"",
                "- \"What is the main purpose of this post?\"",
                "- \"What skills does the author have?\""
            ]
            return "\n".join(response_lines)

    except Exception as e:
        # Surface the failure in the chat rather than crashing the page.
        return f"❌ Analysis error: {str(e)}"


def extract_linkedin_data(url, data_type):
    """Fetch a public LinkedIn page and return its text content, structured.

    Args:
        url: Full LinkedIn URL (profile, company, or post).
        data_type: One of "profile" / "company" / "post" — stored verbatim
            in the result for downstream display.

    Returns:
        On success: dict with "page_info", "content_blocks",
        "extraction_time", "data_type" and "status" == "success".
        On failure: dict with "error" and "status" == "error".
    """
    try:
        # Browser-like headers; LinkedIn blocks obvious bot user agents.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=headers, timeout=25)

        if response.status_code != 200:
            return {
                "error": f"Failed to access page (Status: {response.status_code})",
                "status": "error"
            }

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove scripts and styles (and other non-content chrome)
        for script in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
            script.decompose()

        # Extract and clean text: collapse whitespace runs into single spaces.
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        clean_text = ' '.join(chunk for chunk in chunks if chunk)

        # Extract meaningful content: sentence-ish fragments over 30 chars.
        paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 30]

        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        # Extract page title
        title = soup.find('title')
        page_title = title.text.strip() if title else "LinkedIn Page"

        # Structure the extracted data
        extracted_data = {
            "page_info": {
                "title": page_title,
                "url": url,
                "response_code": response.status_code,
                "content_length": len(clean_text)
            },
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }

        return extracted_data

    except Exception as e:
        return {"error": f"Extraction error: {str(e)}", "status": "error"}


def display_metrics(extracted_data):
    """Render a 4-column metric row (blocks / words / chars / HTTP status)."""
    if not extracted_data:
        return

    page_info = extracted_data['page_info']
    content_blocks = extracted_data['content_blocks']

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Content Blocks", len(content_blocks))
    with col2:
        total_words = sum(len(block.split()) for block in content_blocks)
        st.metric("Total Words", total_words)
    with col3:
        st.metric("Characters", f"{page_info['content_length']:,}")
    with col4:
        st.metric("Response Code", page_info['response_code'])


def main():
    """Page entry point: sidebar controls, extraction results, and chat UI."""
    st.title("💼 LinkedIn AI Analyzer")

    # Initialize session state so reruns keep extraction and chat history.
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing" not in st.session_state:
        st.session_state.processing = False
    if "current_url" not in st.session_state:
        st.session_state.current_url = ""
    if "last_user_input" not in st.session_state:
        st.session_state.last_user_input = ""

    # Sidebar
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")

        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])

        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }

        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )

        # Quick test URLs
        st.markdown("### 🚀 Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }
        for name, url in test_urls.items():
            if st.button(f"🏢 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()

        # Extract button
        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            # Typed URL wins; otherwise fall back to a quick-test selection.
            url_to_use = linkedin_url.strip() or st.session_state.current_url

            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                with st.spinner("🔄 Extracting LinkedIn data..."):
                    extracted_data = extract_linkedin_data(url_to_use, data_type)

                    if extracted_data.get("status") == "success":
                        st.session_state.extracted_data = extracted_data
                        st.session_state.current_url = url_to_use
                        # New extraction invalidates the previous conversation.
                        st.session_state.chat_history = []
                        st.session_state.last_user_input = ""
                        st.success("✅ Data extracted successfully!")
                        st.balloons()
                    else:
                        error_msg = extracted_data.get("error", "Unknown error")
                        st.error(f"❌ Extraction failed: {error_msg}")

                st.session_state.processing = False

        # Chat management
        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("🗑️ Chat history cleared!")

    # Main content area
    st.markdown("### 📊 Extraction Results")

    if st.session_state.processing:
        st.info("🔄 Processing LinkedIn data...")
    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']

        st.success("✅ Extraction Complete")

        # Display metrics
        display_metrics(data)

        # Display page info and sample content in columns
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extracted:** {data['extraction_time']}")
        with col2:
            st.markdown("#### 📝 Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                    st.write(block)
            if len(content_blocks) > 3:
                st.info(f"📄 +{len(content_blocks) - 3} more blocks")
    else:
        st.info("""
👋 **Welcome to LinkedIn AI Analyzer!**

**To get started:**
1. Select content type in sidebar
2. Enter a LinkedIn URL or click suggested company
3. Click "Extract & Analyze"
4. Chat with the AI below about the extracted content

**Supported URLs:**
- 👤 Public Profiles
- 🏢 Company Pages
- 📝 Public Posts
""")

    # Chat section
    st.markdown("---")
    st.markdown("### 💬 Chat with AI")

    has_data = st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success"

    if has_data:
        st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")

        # Display chat history
        for chat in st.session_state.chat_history:
            if chat["role"] == "user":
                with st.chat_message("user"):
                    st.write(chat['content'])
            elif chat["role"] == "assistant":
                with st.chat_message("assistant"):
                    st.write(chat['content'])

        # Suggested questions when no history
        if len(st.session_state.chat_history) == 0:
            st.markdown("#### 💡 Try asking:")
            suggestions = [
                "What is this post about?",
                "Summarize this content",
                "What projects are mentioned?",
                "Tell me about the GitHub profile"
            ]
            cols = st.columns(len(suggestions))
            for i, suggestion in enumerate(suggestions):
                with cols[i]:
                    if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                        st.info(f"💡 Type: '{suggestion}' in the chat below")

    # CHAT INPUT
    if has_data:
        user_input = st.chat_input("Type your question about the LinkedIn data here...")

        # Deduplicate: a rerun re-delivers the same input value once.
        if user_input and user_input != st.session_state.last_user_input:
            st.session_state.last_user_input = user_input
            st.session_state.chat_history.append({"role": "user", "content": user_input})

            with st.spinner("🤔 Analyzing..."):
                response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
                st.session_state.chat_history.append({"role": "assistant", "content": response})

            st.rerun()

    # Features section
    st.markdown("---")
    st.markdown("### 🚀 Features")

    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
**📊 Data Extraction**
- LinkedIn content scraping
- Text processing
- Content analysis
""")
    with feature_cols[1]:
        st.markdown("""
**💬 Smart Chat**
- Interactive Q&A
- Content analysis
- Professional insights
""")
    with feature_cols[2]:
        st.markdown("""
**🔍 Insights**
- Summary generation
- Skill detection
- Experience analysis
""")


if __name__ == "__main__":
    main()