Refat81 committed on
Commit
e3795ec
·
verified ·
1 Parent(s): 095b424

Update pages/linkedin_extractor.py

Browse files
Files changed (1) hide show
  1. pages/linkedin_extractor.py +0 -425
pages/linkedin_extractor.py CHANGED
@@ -1,425 +0,0 @@
1
- # pages/linkedin_extractor.py
2
- import streamlit as st
3
- import requests
4
- from bs4 import BeautifulSoup
5
- import re
6
- import time
7
- import os
8
-
9
# Page-level Streamlit settings: browser-tab title, tab icon, and wide layout.
st.set_page_config(page_title="LinkedIn AI Analyzer", page_icon="💼", layout="wide")
14
-
15
def _generic_analysis_response(user_input, content_blocks):
    """Build the catch-all reply used when no keyword branch can answer.

    The reply echoes the user's question, shows a preview of the first
    content block (its first 200 characters followed by '...', or
    'No content' when nothing was extracted) and lists example questions.
    """
    preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
    return "\n".join([
        "**🤖 Analysis Response:**",
        "",
        "I've analyzed this LinkedIn post for you.",
        "",
        f'**Your question:** "{user_input}"',
        "",
        f"**Post Content:** {preview}",
        "",
        "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
        "",
        "**Try asking:**",
        '- "What projects are mentioned?"',
        '- "Tell me about the GitHub profile"',
        '- "What is the main purpose of this post?"',
        '- "What skills does the author have?"',
    ])


def enhanced_chat_analysis(user_input, extracted_data):
    """Answer a chat question about previously extracted LinkedIn data.

    The question is matched (case-insensitively, by substring) against a
    few keyword groups and a canned Markdown reply is returned.

    Args:
        user_input: The user's question as a string.
        extracted_data: Dict with optional keys 'content_blocks' (list of
            strings), 'page_info' (dict with optional 'title') and
            'data_type' (string); a falsy value means nothing was extracted.

    Returns:
        A Markdown string. This function always returns a string: any
        exception raised while building the reply is reported as an
        "❌ Analysis error: ..." message instead of propagating.
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        content_blocks = extracted_data.get('content_blocks', [])
        page_info = extracted_data.get('page_info', {})
        data_type = extracted_data.get('data_type', 'profile')

        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)
        question = user_input.lower()

        def mentions(*keywords):
            # Substring match, so e.g. 'who' also matches inside 'whole'.
            return any(keyword in question for keyword in keywords)

        # NOTE(review): the project, skill and author descriptions in the
        # replies below are fixed text; they are not derived from
        # extracted_data.

        if mentions('what is this', "what's this", 'post about', 'content about'):
            if not content_blocks:
                # Nothing to quote: fall back to the generic reply rather
                # than implicitly returning None.
                return _generic_analysis_response(user_input, content_blocks)
            return "\n".join([
                "**📝 Post Analysis:**",
                "",
                "This LinkedIn post is about:",
                "",
                f"**{content_blocks[0]}**",
                "",
                "The author is sharing their GitHub profile and showcasing projects they've been working on, including:",
                "",
                "• **University Information Chatbot** - An AI chatbot for university information",
                "• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data",
                "",
                "This appears to be a professional sharing their technical projects and inviting others to check out their work.",
            ])

        if mentions('summary', 'summarize', 'overview'):
            if not content_blocks:
                # Same fallback as above: never return None to the caller.
                return _generic_analysis_response(user_input, content_blocks)
            # First 20 words of each of the first three blocks.
            main_points = []
            for number, block in enumerate(content_blocks[:3], start=1):
                excerpt = ' '.join(block.split()[:20])
                main_points.append(f"{number}. {excerpt}...")
            return "\n".join([
                "**📊 Summary**",
                "",
                f"**Title:** {title}",
                f"**Type:** {data_type.title()}",
                f"**Content Blocks:** {total_blocks}",
                "",
                "**Key Content:**",
                "\n".join(main_points),
                "",
                "The post showcases technical projects and professional work.",
            ])

        if mentions('project', 'github', 'repository'):
            return "\n".join([
                "**🛠️ Projects Mentioned:**",
                "",
                "Based on the LinkedIn post, the author is sharing these projects:",
                "",
                "1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information",
                "2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles",
                "",
                "The author is inviting people to check out their GitHub profile to see these projects.",
            ])

        if mentions('skill', 'technology', 'expertise'):
            return "\n".join([
                "**💻 Technical Skills Implied:**",
                "",
                "Based on the projects mentioned, the author likely has skills in:",
                "",
                "• Python programming",
                "• Web development",
                "• AI/Chatbot development",
                "• Data extraction/processing",
                "• API integration",
                "• GitHub repository management",
                "",
                "These skills are typical for building chatbots and data extraction tools.",
            ])

        if mentions('who', 'author', 'person'):
            return "\n".join([
                "**👤 About the Author:**",
                "",
                "Based on the LinkedIn post:",
                "",
                f"**Title:** {title}",
                "",
                "This appears to be a professional developer/engineer who:",
                "- Builds AI chatbots and data extraction tools",
                "- Shares their work on GitHub",
                "- Is active on LinkedIn for professional networking",
                "- Works on projects like University Information systems and LinkedIn data analysis",
            ])

        return _generic_analysis_response(user_input, content_blocks)

    except Exception as e:
        # UI boundary: report the failure as chat text instead of crashing the page.
        return f"❌ Analysis error: {str(e)}"
123
-
124
def extract_linkedin_data(url, data_type):
    """Fetch a LinkedIn page and turn its visible text into content blocks.

    Args:
        url: Address of the page to download.
        data_type: Label stored unchanged in the result under 'data_type'.

    Returns:
        On success, a dict with 'page_info' (title, url, response_code,
        content_length), 'content_blocks', 'extraction_time', 'data_type'
        and 'status' == 'success'. On any failure, a dict with 'error'
        and 'status' == 'error'; exceptions are not propagated.
    """
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=request_headers, timeout=25)

        status_code = response.status_code
        if status_code != 200:
            return {
                "error": f"Failed to access page (Status: {status_code})",
                "status": "error",
            }

        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop non-content elements before reading the text.
        for element in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
            element.decompose()

        # Normalise the text: strip every line, split it on spaces, keep
        # the non-empty pieces and join them with single spaces.
        fragments = []
        for raw_line in soup.get_text().splitlines():
            for piece in raw_line.strip().split(" "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        clean_text = ' '.join(fragments)

        # A content block is any '.'-delimited span longer than 30 characters.
        paragraphs = []
        for span in clean_text.split('.'):
            span = span.strip()
            if len(span) > 30:
                paragraphs.append(span)

        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error",
            }

        title_tag = soup.find('title')
        if title_tag:
            page_title = title_tag.text.strip()
        else:
            page_title = "LinkedIn Page"

        return {
            "page_info": {
                "title": page_title,
                "url": url,
                "response_code": status_code,
                "content_length": len(clean_text),
            },
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success",
        }

    except Exception as e:
        return {"error": f"Extraction error: {str(e)}", "status": "error"}
184
-
185
def display_metrics(extracted_data):
    """Show four headline metrics for an extraction result in one row.

    Renders the number of content blocks, the total word count across
    blocks, the cleaned-text character count and the HTTP response code.

    Args:
        extracted_data: Success dict containing 'page_info' (with
            'content_length' and 'response_code') and 'content_blocks'.
            A falsy value, or a dict lacking either key (such as an
            error result with only 'error' and 'status'), renders nothing.
    """
    if not extracted_data:
        return

    page_info = extracted_data.get('page_info')
    content_blocks = extracted_data.get('content_blocks')
    if page_info is None or content_blocks is None:
        # Not a success result: there is nothing to measure, so skip
        # rendering instead of raising KeyError.
        return

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Content Blocks", len(content_blocks))

    with col2:
        total_words = sum(len(block.split()) for block in content_blocks)
        st.metric("Total Words", total_words)

    with col3:
        st.metric("Characters", f"{page_info['content_length']:,}")

    with col4:
        st.metric("Response Code", page_info['response_code'])
207
-
208
_WELCOME_MESSAGE = """
👋 **Welcome to LinkedIn AI Analyzer!**

**To get started:**
1. Select content type in sidebar
2. Enter a LinkedIn URL or click suggested company
3. Click "Extract & Analyze"
4. Chat with the AI below about the extracted content

**Supported URLs:**
- 👤 Public Profiles
- 🏢 Company Pages
- 📝 Public Posts
"""

_FEATURE_CARDS = (
    """
**📊 Data Extraction**
- LinkedIn content scraping
- Text processing
- Content analysis
""",
    """
**💬 Smart Chat**
- Interactive Q&A
- Content analysis
- Professional insights
""",
    """
**🔍 Insights**
- Summary generation
- Skill detection
- Experience analysis
""",
)


def _init_session_state():
    """Create every session-state key this page uses, if it is missing."""
    defaults = {
        "extracted_data": None,
        "chat_history": [],
        "processing": False,
        "current_url": "",
        "last_user_input": "",
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _render_sidebar():
    """Draw the sidebar: URL entry, quick-test buttons, extraction, chat reset."""
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")

        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])

        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/",
        }

        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL",
        )

        # Quick test URLs: remembered in session state and used by the
        # extract button when the text box is left empty.
        st.markdown("### 🚀 Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }

        for name, url in test_urls.items():
            if st.button(f"🏢 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()

        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            url_to_use = linkedin_url.strip() or st.session_state.current_url

            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                # NOTE: the flag is set and cleared within this same run,
                # so the "Processing" banner in the results area is never
                # reached; kept so the session key stays available.
                st.session_state.processing = True
                with st.spinner("🔄 Extracting LinkedIn data..."):
                    extracted_data = extract_linkedin_data(url_to_use, data_type)

                if extracted_data.get("status") == "success":
                    st.session_state.extracted_data = extracted_data
                    st.session_state.current_url = url_to_use
                    st.session_state.chat_history = []  # Clear previous chat
                    st.session_state.last_user_input = ""  # Reset last input
                    st.success("✅ Data extracted successfully!")
                    st.balloons()
                else:
                    error_msg = extracted_data.get("error", "Unknown error")
                    st.error(f"❌ Extraction failed: {error_msg}")

                st.session_state.processing = False

        # Chat management is only offered once there is data to chat about.
        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("🗑️ Chat history cleared!")


def _render_results():
    """Draw the results area: metrics, page info and sample content, or a welcome note."""
    st.markdown("### 📊 Extraction Results")

    if st.session_state.processing:
        st.info("🔄 Processing LinkedIn data...")
        return

    data = st.session_state.extracted_data
    if not data:
        st.info(_WELCOME_MESSAGE)
        return

    page_info = data['page_info']
    content_blocks = data['content_blocks']

    st.success("✅ Extraction Complete")
    display_metrics(data)

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("#### 🏷️ Page Information")
        st.write(f"**Title:** {page_info['title']}")
        st.write(f"**URL:** {page_info['url']}")
        st.write(f"**Type:** {data['data_type'].title()}")
        st.write(f"**Content Blocks:** {len(content_blocks)}")
        st.write(f"**Extracted:** {data['extraction_time']}")

    with col2:
        st.markdown("#### 📝 Sample Content")
        for i, block in enumerate(content_blocks[:3]):
            with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                st.write(block)

        if len(content_blocks) > 3:
            st.info(f"📄 +{len(content_blocks) - 3} more blocks")


def _render_chat():
    """Draw the chat section: history, suggested questions and the input box."""
    st.markdown("---")
    st.markdown("### 💬 Chat with AI")

    data = st.session_state.extracted_data
    if not (data and data.get("status") == "success"):
        return

    st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")

    for chat in st.session_state.chat_history:
        if chat["role"] in ("user", "assistant"):
            with st.chat_message(chat["role"]):
                st.write(chat['content'])

    # Suggested questions are shown only until the first message is sent.
    if not st.session_state.chat_history:
        st.markdown("#### 💡 Try asking:")
        suggestions = [
            "What is this post about?",
            "Summarize this content",
            "What projects are mentioned?",
            "Tell me about the GitHub profile",
        ]

        cols = st.columns(len(suggestions))
        for i, suggestion in enumerate(suggestions):
            with cols[i]:
                if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                    st.info(f"💡 Type: '{suggestion}' in the chat below")

    user_input = st.chat_input("Type your question about the LinkedIn data here...")

    # The submitted text is only present on the run triggered by the
    # submission, so it is handled unconditionally. Comparing it with the
    # previous question would silently drop a question asked twice in a row.
    if user_input:
        st.session_state.last_user_input = user_input
        st.session_state.chat_history.append({"role": "user", "content": user_input})

        with st.spinner("🤔 Analyzing..."):
            response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
            st.session_state.chat_history.append({"role": "assistant", "content": response})

        # Rerun so the new messages are drawn by the history loop above.
        st.rerun()


def _render_features():
    """Draw the static three-column feature overview at the bottom of the page."""
    st.markdown("---")
    st.markdown("### 🚀 Features")

    for column, card in zip(st.columns(3), _FEATURE_CARDS):
        with column:
            st.markdown(card)


def main():
    """Render the LinkedIn AI Analyzer page.

    Draws, in order: the title, the sidebar (URL entry, quick-test
    buttons, extraction trigger, chat reset), the extraction results,
    the chat section and the feature overview. All state shared between
    reruns lives in ``st.session_state``.
    """
    st.title("💼 LinkedIn AI Analyzer")

    _init_session_state()
    _render_sidebar()
    _render_results()
    _render_chat()
    _render_features()
423
-
424
# Entry point: build the page when this module is run as the main script.
if __name__ == "__main__":
    main()