Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Social_Media_Data_Extractor_Chatbot / pages /linkedin_extractor.py

Refat81

Update pages/linkedin_extractor.py

f9852cd verified 6 days ago

raw

history blame contribute delete

17.2 kB

	# pages/linkedin_extractor.py
	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	import re
	import time
	import os
	# Add to TOP of each extractor file
	import streamlit as st

	# ============================================
	# AUTHENTICATION CHECK
	# ============================================
	# The page is only rendered for sessions whose "authenticated" flag is set
	# and truthy; every other session gets an "Access Denied" screen and the
	# script run is stopped before any of the analyzer UI is built.
	is_logged_in = "authenticated" in st.session_state and st.session_state.authenticated

	if not is_logged_in:
	    st.set_page_config(page_title="Access Denied", page_icon="🔒", layout="centered")

	    # Red banner: inline CSS followed by the HTML that uses it.
	    denied_banner_lines = [
	        "",
	        "<style>",
	        ".error-container {",
	        "text-align: center;",
	        "padding: 3rem;",
	        "background: linear-gradient(135deg, #ef4444, #dc2626);",
	        "color: white;",
	        "border-radius: 10px;",
	        "margin: 2rem 0;",
	        "}",
	        "</style>",
	        "",
	        '<div class="error-container">',
	        "<h1>🔐 Access Denied</h1>",
	        '<p style="font-size: 1.2rem;">Please login to access this page</p>',
	        "</div>",
	        "",
	    ]
	    st.markdown("\n".join(denied_banner_lines), unsafe_allow_html=True)

	    # Button-styled link pointing at the app root ("/").
	    login_link_lines = [
	        "",
	        '<div style="text-align: center; margin-top: 2rem;">',
	        '<a href="/">',
	        '<button style="',
	        "background-color: #4285F4;",
	        "color: white;",
	        "padding: 12px 24px;",
	        "border-radius: 6px;",
	        "border: none;",
	        "font-size: 16px;",
	        "cursor: pointer;",
	        '">',
	        "🔐 Go to Login Page",
	        "</button>",
	        "</a>",
	        "</div>",
	        "",
	    ]
	    st.markdown("\n".join(login_link_lines), unsafe_allow_html=True)

	    # Halt this script run so nothing below is executed for this session.
	    st.stop()

	# Page configuration for authenticated sessions.
	st.set_page_config(
	    page_title="LinkedIn AI Analyzer",
	    page_icon="💼",
	    layout="wide",
	)

	def enhanced_chat_analysis(user_input, extracted_data):
	    """Return a canned chat reply about previously extracted LinkedIn data.

	    The question is lower-cased and tested by substring against keyword
	    groups in a fixed order (post topic, summary, projects, skills, author);
	    the first group that matches decides the reply, otherwise a generic
	    reply with a 200-character preview of the first block is returned.

	    Args:
	        user_input: Question text typed by the user.
	        extracted_data: Mapping with optional ``content_blocks`` (list of
	            str), ``page_info`` (mapping with ``title``) and ``data_type``
	            keys. A falsy value yields the "no data" message.

	    Returns:
	        The reply text. A string is always returned: the post-topic and
	        summary branches answer with an explicit "no content" message when
	        there are no content blocks instead of falling through to ``None``,
	        and any exception is reported as an "Analysis error" message.
	    """
	    try:
	        if not extracted_data:
	            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

	        # ``or`` also covers keys that are present but set to None.
	        content_blocks = extracted_data.get('content_blocks') or []
	        page_info = extracted_data.get('page_info') or {}
	        data_type = extracted_data.get('data_type', 'profile')

	        title = page_info.get('title', 'LinkedIn Content')
	        total_blocks = len(content_blocks)
	        question = user_input.lower()

	        def mentions(*keywords):
	            """Tell whether any keyword occurs as a substring of the question."""
	            return any(keyword in question for keyword in keywords)

	        no_content_reply = (
	            "❌ No content blocks were extracted, so there is nothing to analyze. "
	            "Please extract the page again."
	        )

	        # NOTE(review): apart from the interpolated block text and title, the
	        # replies below are fixed literals (GitHub profile, University
	        # Information Chatbot, ...) and are not derived from the extracted page.
	        if mentions('what is this', "what's this", 'post about', 'content about'):
	            if not content_blocks:
	                return no_content_reply
	            return "\n".join([
	                "📝 Post Analysis:",
	                "",
	                "This LinkedIn post is about:",
	                "",
	                f"{content_blocks[0]}",
	                "",
	                "The author is sharing their GitHub profile and showcasing projects they've been working on, including:",
	                "",
	                "• University Information Chatbot - An AI chatbot for university information",
	                "• LinkedIn Data Extractor - A tool for extracting and analyzing LinkedIn data",
	                "",
	                "This appears to be a professional sharing their technical projects and inviting others to check out their work.",
	            ])

	        if mentions('summary', 'summarize', 'overview'):
	            if not content_blocks:
	                return no_content_reply
	            # First 20 words of each of the first three blocks, numbered from 1.
	            main_points = [
	                f"{number}. {' '.join(block.split()[:20])}..."
	                for number, block in enumerate(content_blocks[:3], start=1)
	            ]
	            return "\n".join([
	                "📊 Summary",
	                "",
	                f"Title: {title}",
	                f"Type: {data_type.title()}",
	                f"Content Blocks: {total_blocks}",
	                "",
	                "Key Content:",
	                "\n".join(main_points),
	                "",
	                "The post showcases technical projects and professional work.",
	            ])

	        if mentions('project', 'github', 'repository'):
	            return "\n".join([
	                "🛠️ Projects Mentioned:",
	                "",
	                "Based on the LinkedIn post, the author is sharing these projects:",
	                "",
	                "1. University Information Chatbot - An AI-powered chatbot for providing university-related information",
	                "2. LinkedIn Data Extractor - A tool for extracting and analyzing data from LinkedIn profiles",
	                "",
	                "The author is inviting people to check out their GitHub profile to see these projects.",
	            ])

	        if mentions('skill', 'technology', 'expertise'):
	            return "\n".join([
	                "💻 Technical Skills Implied:",
	                "",
	                "Based on the projects mentioned, the author likely has skills in:",
	                "",
	                "• Python programming",
	                "• Web development",
	                "• AI/Chatbot development",
	                "• Data extraction/processing",
	                "• API integration",
	                "• GitHub repository management",
	                "",
	                "These skills are typical for building chatbots and data extraction tools.",
	            ])

	        if mentions('who', 'author', 'person'):
	            return "\n".join([
	                "👤 About the Author:",
	                "",
	                "Based on the LinkedIn post:",
	                "",
	                f"Title: {title}",
	                "",
	                "This appears to be a professional developer/engineer who:",
	                "- Builds AI chatbots and data extraction tools",
	                "- Shares their work on GitHub",
	                "- Is active on LinkedIn for professional networking",
	                "- Works on projects like University Information systems and LinkedIn data analysis",
	            ])

	        # No keyword group matched: generic reply with a short preview.
	        post_preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
	        return "\n".join([
	            "🤖 Analysis Response:",
	            "",
	            "I've analyzed this LinkedIn post for you.",
	            "",
	            f"Your question: \"{user_input}\"",
	            "",
	            f"Post Content: {post_preview}",
	            "",
	            "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
	            "",
	            "Try asking:",
	            "- \"What projects are mentioned?\"",
	            "- \"Tell me about the GitHub profile\"",
	            "- \"What is the main purpose of this post?\"",
	            "- \"What skills does the author have?\"",
	        ])

	    except Exception as e:
	        # UI boundary: the caller displays whatever string comes back, so a
	        # failure is reported as text instead of propagating.
	        return f"❌ Analysis error: {str(e)}"

	def extract_linkedin_data(url, data_type):
	    """Fetch a page and split its visible text into content blocks.

	    The page is requested with browser-like headers, parsed with
	    BeautifulSoup, stripped of script/style/meta/link/nav/header/footer
	    elements, and the remaining text is whitespace-normalised and split on
	    ``.``; pieces longer than 30 characters become the content blocks.

	    Args:
	        url: Address to request.
	        data_type: Caller-supplied label, copied unchanged into the result.

	    Returns:
	        On success, a dict with ``page_info`` (``title``, ``url``,
	        ``response_code``, ``content_length``), ``content_blocks``,
	        ``extraction_time``, ``data_type`` and ``status`` set to
	        ``"success"``. On a non-200 response, an empty result, or any
	        exception, a dict with an ``error`` message and ``status`` set to
	        ``"error"``.
	    """
	    request_headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
	        # The catch-all media range is "*/*"; a bare "/" is not a valid range.
	        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	    }

	    try:
	        st.info(f"🌐 Accessing: {url}")
	        response = requests.get(url, headers=request_headers, timeout=25)

	        if response.status_code != 200:
	            return {
	                "error": f"Failed to access page (Status: {response.status_code})",
	                "status": "error"
	            }

	        soup = BeautifulSoup(response.text, 'html.parser')

	        # Drop elements whose text is not page content.
	        for tag in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
	            tag.decompose()

	        # Collapse every run of whitespace (including newlines) to one space.
	        clean_text = ' '.join(soup.get_text().split())

	        # Keep only '.'-separated pieces long enough to be meaningful.
	        pieces = (piece.strip() for piece in clean_text.split('.'))
	        paragraphs = [piece for piece in pieces if len(piece) > 30]

	        if not paragraphs:
	            return {
	                "error": "No meaningful content found. The page might require login or have restricted access.",
	                "status": "error"
	            }

	        title_tag = soup.find('title')
	        page_title = title_tag.text.strip() if title_tag else "LinkedIn Page"

	        return {
	            "page_info": {
	                "title": page_title,
	                "url": url,
	                "response_code": response.status_code,
	                "content_length": len(clean_text)
	            },
	            "content_blocks": paragraphs,
	            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
	            "data_type": data_type,
	            "status": "success"
	        }

	    except Exception as e:
	        # UI boundary: callers inspect the returned "status" rather than
	        # catching exceptions, so failures are reported in the result dict.
	        return {"error": f"Extraction error: {str(e)}", "status": "error"}

	def display_metrics(extracted_data):
	    """Render four side-by-side metrics for an extraction result.

	    Shows the number of content blocks, the total word count across all
	    blocks, the cleaned text length, and the HTTP response code. Does
	    nothing when ``extracted_data`` is falsy.
	    """
	    if not extracted_data:
	        return

	    info = extracted_data['page_info']
	    blocks = extracted_data['content_blocks']

	    word_count = sum(len(text.split()) for text in blocks)

	    # One (label, value) pair per column, in display order.
	    metrics = [
	        ("Content Blocks", len(blocks)),
	        ("Total Words", word_count),
	        ("Characters", f"{info['content_length']:,}"),
	        ("Response Code", info['response_code']),
	    ]

	    for column, (label, value) in zip(st.columns(4), metrics):
	        with column:
	            st.metric(label, value)

	def _init_session_state():
	    """Create the session keys this page uses, keeping any existing values."""
	    defaults = {
	        "extracted_data": None,
	        "chat_history": [],
	        "processing": False,
	        "current_url": "",
	        "last_user_input": "",
	    }
	    for key, value in defaults.items():
	        if key not in st.session_state:
	            st.session_state[key] = value


	def _run_extraction(url_to_use, data_type):
	    """Extract ``url_to_use`` and store the result (or show the error)."""
	    st.session_state.processing = True
	    try:
	        with st.spinner("🔄 Extracting LinkedIn data..."):
	            extracted_data = extract_linkedin_data(url_to_use, data_type)

	        if extracted_data.get("status") == "success":
	            st.session_state.extracted_data = extracted_data
	            st.session_state.current_url = url_to_use
	            # New data invalidates the previous conversation.
	            st.session_state.chat_history = []
	            st.session_state.last_user_input = ""
	            st.success("✅ Data extracted successfully!")
	            st.balloons()
	        else:
	            error_msg = extracted_data.get("error", "Unknown error")
	            st.error(f"❌ Extraction failed: {error_msg}")
	    finally:
	        # Always clear the flag so the page cannot stay stuck on "processing".
	        st.session_state.processing = False


	def _render_sidebar():
	    """Render the configuration sidebar and handle its buttons."""
	    with st.sidebar:
	        st.markdown("### ⚙️ Configuration")

	        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])

	        url_placeholder = {
	            "profile": "https://www.linkedin.com/in/username/",
	            "company": "https://www.linkedin.com/company/companyname/",
	            "post": "https://www.linkedin.com/posts/username_postid/"
	        }

	        linkedin_url = st.text_input(
	            "🌐 LinkedIn URL",
	            placeholder=url_placeholder[data_type],
	            help="Enter a public LinkedIn URL"
	        )

	        # Quick test URLs: clicking one stores it as the current URL.
	        st.markdown("### 🚀 Quick Test")
	        test_urls = {
	            "Microsoft": "https://www.linkedin.com/company/microsoft/",
	            "Google": "https://www.linkedin.com/company/google/",
	            "Apple": "https://www.linkedin.com/company/apple/",
	        }

	        for name, url in test_urls.items():
	            if st.button(f"🏢 {name}", key=name, use_container_width=True):
	                st.session_state.current_url = url
	                st.rerun()

	        # Extract button: a typed URL wins over the stored quick-test URL.
	        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
	            url_to_use = linkedin_url.strip() or st.session_state.current_url

	            if not url_to_use:
	                st.warning("⚠️ Please enter a LinkedIn URL")
	            elif not url_to_use.startswith('https://www.linkedin.com/'):
	                st.error("❌ Please enter a valid LinkedIn URL")
	            else:
	                _run_extraction(url_to_use, data_type)

	        # Chat management is only offered once there is data to chat about.
	        if st.session_state.extracted_data:
	            st.markdown("---")
	            st.subheader("💬 Chat Management")
	            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
	                st.session_state.chat_history = []
	                st.session_state.last_user_input = ""
	                st.success("🗑️ Chat history cleared!")


	def _render_results():
	    """Render the extraction results, or the welcome text when there are none."""
	    st.markdown("### 📊 Extraction Results")

	    if st.session_state.processing:
	        st.info("🔄 Processing LinkedIn data...")

	    elif st.session_state.extracted_data:
	        data = st.session_state.extracted_data
	        page_info = data['page_info']
	        content_blocks = data['content_blocks']

	        st.success("✅ Extraction Complete")

	        display_metrics(data)

	        # Page info on the left, sample content on the right.
	        col1, col2 = st.columns(2)

	        with col1:
	            st.markdown("#### 🏷️ Page Information")
	            st.write(f"Title: {page_info['title']}")
	            st.write(f"URL: {page_info['url']}")
	            st.write(f"Type: {data['data_type'].title()}")
	            st.write(f"Content Blocks: {len(content_blocks)}")
	            st.write(f"Extracted: {data['extraction_time']}")

	        with col2:
	            st.markdown("#### 📝 Sample Content")
	            for i, block in enumerate(content_blocks[:3]):
	                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
	                    st.write(block)

	            if len(content_blocks) > 3:
	                st.info(f"📄 +{len(content_blocks) - 3} more blocks")

	    else:
	        st.info("\n".join([
	            "",
	            "👋 Welcome to LinkedIn AI Analyzer!",
	            "",
	            "To get started:",
	            "1. Select content type in sidebar",
	            "2. Enter a LinkedIn URL or click suggested company",
	            "3. Click \"Extract & Analyze\"",
	            "4. Chat with the AI below about the extracted content",
	            "",
	            "Supported URLs:",
	            "- 👤 Public Profiles",
	            "- 🏢 Company Pages",
	            "- 📝 Public Posts",
	            "",
	        ]))


	def _render_chat():
	    """Render the chat history, suggestions and input for extracted data."""
	    st.markdown("---")
	    st.markdown("### 💬 Chat with AI")

	    data = st.session_state.extracted_data
	    if not (data and data.get("status") == "success"):
	        return

	    st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")

	    # Replay the conversation so far; entries with other roles are skipped.
	    for chat in st.session_state.chat_history:
	        if chat["role"] in ("user", "assistant"):
	            with st.chat_message(chat["role"]):
	                st.write(chat['content'])

	    # Suggested questions are shown only before the first message.
	    if not st.session_state.chat_history:
	        st.markdown("#### 💡 Try asking:")
	        suggestions = [
	            "What is this post about?",
	            "Summarize this content",
	            "What projects are mentioned?",
	            "Tell me about the GitHub profile"
	        ]

	        for i, (col, suggestion) in enumerate(zip(st.columns(len(suggestions)), suggestions)):
	            with col:
	                if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
	                    st.info(f"💡 Type: '{suggestion}' in the chat below")

	    user_input = st.chat_input("Type your question about the LinkedIn data here...")

	    # st.chat_input only yields a value on the run triggered by a submit, so
	    # no comparison with the previous question is needed; comparing would
	    # silently drop a question the user deliberately asks twice in a row.
	    if user_input:
	        st.session_state.last_user_input = user_input
	        st.session_state.chat_history.append({"role": "user", "content": user_input})

	        with st.spinner("🤔 Analyzing..."):
	            response = enhanced_chat_analysis(user_input, data)
	        st.session_state.chat_history.append({"role": "assistant", "content": response})

	        # Rerun so the new messages are drawn by the history loop above.
	        st.rerun()


	def _render_features():
	    """Render the static three-column feature overview."""
	    st.markdown("---")
	    st.markdown("### 🚀 Features")

	    feature_texts = [
	        "\n".join([
	            "",
	            "📊 Data Extraction",
	            "- LinkedIn content scraping",
	            "- Text processing",
	            "- Content analysis",
	            "",
	        ]),
	        "\n".join([
	            "",
	            "💬 Smart Chat",
	            "- Interactive Q&A",
	            "- Content analysis",
	            "- Professional insights",
	            "",
	        ]),
	        "\n".join([
	            "",
	            "🔍 Insights",
	            "- Summary generation",
	            "- Skill detection",
	            "- Experience analysis",
	            "",
	        ]),
	    ]

	    for column, text in zip(st.columns(3), feature_texts):
	        with column:
	            st.markdown(text)


	def main():
	    """Build the LinkedIn AI Analyzer page.

	    Draws, in order: the title, the sidebar (content type, URL input,
	    quick-test buttons, extract button, chat management), the extraction
	    results, the chat section and the feature overview.

	    NOTE(review): the source this was rebuilt from had lost its indentation;
	    the extract button and chat management are placed inside the sidebar
	    because the in-app texts refer to extracting "using the sidebar" —
	    confirm against the deployed layout.
	    """
	    st.title("💼 LinkedIn AI Analyzer")

	    _init_session_state()
	    _render_sidebar()
	    _render_results()
	    _render_chat()
	    _render_features()

	# Build the page only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    main()