# pages/linkedin_extractor.py
"""Streamlit page: extract public LinkedIn content and chat about it.

Requires an authenticated session (set by the app's login page via
st.session_state.authenticated) before rendering anything.
"""
import os
import re
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup

# ============================================
# AUTHENTICATION CHECK
# ============================================
# st.set_page_config must be the FIRST Streamlit rendering command on a page
# and may run only once per run. The st.stop() below guarantees the second
# call (the authenticated layout) is never reached on the denied path.
if "authenticated" not in st.session_state or not st.session_state.authenticated:
    st.set_page_config(page_title="Access Denied", page_icon="🔒", layout="centered")
    st.markdown("""
🔐 Access Denied
Please login to access this page
""", unsafe_allow_html=True)
    st.stop()

st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="💼",
    layout="wide"
)
def enhanced_chat_analysis(user_input, extracted_data):
    """Answer a user question about previously extracted LinkedIn data.

    Routes the question through keyword patterns to a canned,
    markdown-formatted response built from the extracted content.

    Args:
        user_input: The user's free-text question.
        extracted_data: Dict produced by extract_linkedin_data(), or
            None/empty when nothing has been extracted yet.

    Returns:
        A markdown answer string. Always a string, never None, so callers
        can append the result to the chat history safely.

    NOTE(review): several responses contain hard-coded claims about a
    specific demo post (GitHub profile, chatbot/extractor projects) that
    are emitted regardless of the actual page content — consider deriving
    these from content_blocks instead.
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        content_blocks = extracted_data.get('content_blocks', [])
        page_info = extracted_data.get('page_info', {})
        data_type = extracted_data.get('data_type', 'profile')

        # Basic info shared by several response templates.
        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)
        user_input_lower = user_input.lower()

        # "What is this?" -> quote the first extracted block.
        if any(word in user_input_lower for word in ['what is this', 'what\'s this', 'post about', 'content about']):
            if content_blocks:
                main_content = content_blocks[0]
                return f"""**📝 Post Analysis:**
This LinkedIn post is about:
**{main_content}**
The author is sharing their GitHub profile and showcasing projects they've been working on, including:
• **University Information Chatbot** - An AI chatbot for university information
• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data
This appears to be a professional sharing their technical projects and inviting others to check out their work."""
            # FIX: this branch previously fell through and returned None.
            return "❌ No content was extracted from this page, so there is nothing to analyze yet."

        # Summary -> first 20 words of up to three blocks as key points.
        elif any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
            if content_blocks:
                main_points = []
                for i, block in enumerate(content_blocks[:3]):
                    words = block.split()[:20]
                    main_points.append(f"{i+1}. {' '.join(words)}...")
                return f"""**📊 Summary**
**Title:** {title}
**Type:** {data_type.title()}
**Content Blocks:** {total_blocks}
**Key Content:**
{chr(10).join(main_points)}
The post showcases technical projects and professional work."""
            # FIX: this branch previously fell through and returned None.
            return "❌ No content was extracted from this page, so there is nothing to summarize yet."

        # Projects / GitHub questions.
        elif any(word in user_input_lower for word in ['project', 'github', 'repository']):
            return """**🛠️ Projects Mentioned:**
Based on the LinkedIn post, the author is sharing these projects:
1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information
2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles
The author is inviting people to check out their GitHub profile to see these projects."""

        # Skills / technology questions.
        elif any(word in user_input_lower for word in ['skill', 'technology', 'expertise']):
            return """**💻 Technical Skills Implied:**
Based on the projects mentioned, the author likely has skills in:
• Python programming
• Web development
• AI/Chatbot development
• Data extraction/processing
• API integration
• GitHub repository management
These skills are typical for building chatbots and data extraction tools."""

        # Author questions.
        elif any(word in user_input_lower for word in ['who', 'author', 'person']):
            return f"""**👤 About the Author:**
Based on the LinkedIn post:
**Title:** {title}
This appears to be a professional developer/engineer who:
- Builds AI chatbots and data extraction tools
- Shares their work on GitHub
- Is active on LinkedIn for professional networking
- Works on projects like University Information systems and LinkedIn data analysis"""

        else:
            # Generic fallback: echo the question plus a content preview.
            # Parenthesized for clarity -- the conditional covers the whole
            # concatenation, so the '...' is only appended when blocks exist.
            post_preview = (content_blocks[0][:200] + '...') if content_blocks else 'No content'
            response_lines = [
                "**🤖 Analysis Response:**",
                "",
                f"I've analyzed this LinkedIn post for you.",
                "",
                f"**Your question:** \"{user_input}\"",
                "",
                f"**Post Content:** {post_preview}",
                "",
                "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
                "",
                "**Try asking:**",
                "- \"What projects are mentioned?\"",
                "- \"Tell me about the GitHub profile\"",
                "- \"What is the main purpose of this post?\"",
                "- \"What skills does the author have?\""
            ]
            return "\n".join(response_lines)
    except Exception as e:
        # Never propagate -- the chat UI expects a displayable string.
        return f"❌ Analysis error: {str(e)}"
def extract_linkedin_data(url, data_type):
    """Fetch a public LinkedIn page and structure its visible text.

    Args:
        url: Public LinkedIn URL to fetch.
        data_type: Caller-selected label ("profile" | "company" | "post"),
            stored verbatim in the result.

    Returns:
        On success, a dict with status "success", page_info, content_blocks,
        and extraction_time. On any failure, a dict with status "error" and
        an "error" message. Never raises.
    """
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=request_headers, timeout=25)
        if response.status_code != 200:
            return {
                "error": f"Failed to access page (Status: {response.status_code})",
                "status": "error"
            }

        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop non-content elements before pulling text out of the DOM.
        for element in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
            element.decompose()

        # Collapse the remaining text into a single whitespace-normalized string.
        raw_text = soup.get_text()
        stripped_lines = (raw_line.strip() for raw_line in raw_text.splitlines())
        fragments = (piece.strip() for raw_line in stripped_lines for piece in raw_line.split(" "))
        clean_text = ' '.join(fragment for fragment in fragments if fragment)

        # Sentence-ish chunks longer than 30 chars count as meaningful blocks.
        paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 30]
        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title_tag = soup.find('title')
        page_title = title_tag.text.strip() if title_tag else "LinkedIn Page"

        return {
            "page_info": {
                "title": page_title,
                "url": url,
                "response_code": response.status_code,
                "content_length": len(clean_text)
            },
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }
    except Exception as e:
        # Report any failure (network, parse, ...) as an error payload.
        return {"error": f"Extraction error: {str(e)}", "status": "error"}
def display_metrics(extracted_data):
    """Render four summary metrics (blocks/words/chars/status) for a result.

    No-op when extracted_data is falsy.
    """
    if not extracted_data:
        return
    page_info = extracted_data['page_info']
    content_blocks = extracted_data['content_blocks']
    word_total = sum(len(block.split()) for block in content_blocks)
    metric_cols = st.columns(4)
    with metric_cols[0]:
        st.metric("Content Blocks", len(content_blocks))
    with metric_cols[1]:
        st.metric("Total Words", word_total)
    with metric_cols[2]:
        st.metric("Characters", f"{page_info['content_length']:,}")
    with metric_cols[3]:
        st.metric("Response Code", page_info['response_code'])
def main():
    """Render the LinkedIn AI Analyzer page.

    Layout: sidebar (configuration + extraction controls + chat management),
    then the extraction-results area, a chat interface over the extracted
    data, and a static features section.
    """
    st.title("💼 LinkedIn AI Analyzer")
    # Initialize session state so every key exists before it is read below.
    if "extracted_data" not in st.session_state:
        st.session_state.extracted_data = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "processing" not in st.session_state:
        st.session_state.processing = False
    if "current_url" not in st.session_state:
        st.session_state.current_url = ""
    if "last_user_input" not in st.session_state:
        st.session_state.last_user_input = ""
    # Sidebar: extraction configuration and controls.
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")
        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
        # Placeholder text tracks the selected content type.
        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }
        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )
        # Quick test URLs: one-click demo company pages.
        st.markdown("### 🚀 Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }
        for name, url in test_urls.items():
            if st.button(f"🏢 {name}", key=name, use_container_width=True):
                # Stash the chosen URL; the rerun re-renders with it selected.
                st.session_state.current_url = url
                st.rerun()
        # Extract button: validate the URL, then run the scraper.
        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            # A typed URL takes precedence over a quick-test selection.
            url_to_use = linkedin_url.strip() or st.session_state.current_url
            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                with st.spinner("🔄 Extracting LinkedIn data..."):
                    extracted_data = extract_linkedin_data(url_to_use, data_type)
                    if extracted_data.get("status") == "success":
                        # Replace any previous result and reset the chat.
                        st.session_state.extracted_data = extracted_data
                        st.session_state.current_url = url_to_use
                        st.session_state.chat_history = []
                        st.session_state.last_user_input = ""
                        st.success("✅ Data extracted successfully!")
                        st.balloons()
                    else:
                        error_msg = extracted_data.get("error", "Unknown error")
                        st.error(f"❌ Extraction failed: {error_msg}")
                st.session_state.processing = False
        # Chat management (only shown once data has been extracted).
        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("🗑️ Chat history cleared!")
    # Main content area: processing status / results / welcome text.
    st.markdown("### 📊 Extraction Results")
    if st.session_state.processing:
        st.info("🔄 Processing LinkedIn data...")
    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']
        st.success("✅ Extraction Complete")
        # Display metrics
        display_metrics(data)
        # Display page info and sample content in columns
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extracted:** {data['extraction_time']}")
        with col2:
            st.markdown("#### 📝 Sample Content")
            # Show at most three blocks; summarize the remainder.
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                    st.write(block)
            if len(content_blocks) > 3:
                st.info(f"📄 +{len(content_blocks) - 3} more blocks")
    else:
        st.info("""
👋 **Welcome to LinkedIn AI Analyzer!**
**To get started:**
1. Select content type in sidebar
2. Enter a LinkedIn URL or click suggested company
3. Click "Extract & Analyze"
4. Chat with the AI below about the extracted content
**Supported URLs:**
- 👤 Public Profiles
- 🏢 Company Pages
- 📝 Public Posts
""")
    # Chat section
    st.markdown("---")
    st.markdown("### 💬 Chat with AI")
    has_data = st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success"
    if has_data:
        st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")
        # Display chat history
        for chat in st.session_state.chat_history:
            if chat["role"] == "user":
                with st.chat_message("user"):
                    st.write(chat['content'])
            elif chat["role"] == "assistant":
                with st.chat_message("assistant"):
                    st.write(chat['content'])
        # Suggested questions when no history
        if len(st.session_state.chat_history) == 0:
            st.markdown("#### 💡 Try asking:")
            suggestions = [
                "What is this post about?",
                "Summarize this content",
                "What projects are mentioned?",
                "Tell me about the GitHub profile"
            ]
            cols = st.columns(len(suggestions))
            for i, suggestion in enumerate(suggestions):
                with cols[i]:
                    # NOTE(review): these buttons only hint the text to type;
                    # they do not submit the question to the chat themselves.
                    if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                        st.info(f"💡 Type: '{suggestion}' in the chat below")
    # CHAT INPUT
    if has_data:
        user_input = st.chat_input("Type your question about the LinkedIn data here...")
        # NOTE(review): the last_user_input guard deduplicates reruns but
        # also blocks asking the exact same question twice in a row --
        # confirm that is intended.
        if user_input and user_input != st.session_state.last_user_input:
            st.session_state.last_user_input = user_input
            st.session_state.chat_history.append({"role": "user", "content": user_input})
            with st.spinner("🤔 Analyzing..."):
                response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
                st.session_state.chat_history.append({"role": "assistant", "content": response})
            st.rerun()
    # Features section
    st.markdown("---")
    st.markdown("### 🚀 Features")
    feature_cols = st.columns(3)
    with feature_cols[0]:
        st.markdown("""
**📊 Data Extraction**
- LinkedIn content scraping
- Text processing
- Content analysis
""")
    with feature_cols[1]:
        st.markdown("""
**💬 Smart Chat**
- Interactive Q&A
- Content analysis
- Professional insights
""")
    with feature_cols[2]:
        st.markdown("""
**🔍 Insights**
- Summary generation
- Skill detection
- Experience analysis
""")
# Entry point: Streamlit executes this module top-to-bottom, so the guard
# also fires when the page runs under `streamlit run`.
if __name__ == "__main__":
    main()