Refat81 committed on
Commit
8c65300
·
verified ·
1 Parent(s): e3795ec

Update pages/linkedin_extractor.py

Browse files
Files changed (1) hide show
  1. pages/linkedin_extractor.py +425 -0
pages/linkedin_extractor.py CHANGED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pages/linkedin_extractor.py
2
+ import streamlit as st
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import re
6
+ import time
7
+ import os
8
+
9
# Page-level configuration (browser tab title/icon and wide layout).
# Kept at module top level so it runs before any other Streamlit call on this page.
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="💼",
    layout="wide",
)
14
+
15
def enhanced_chat_analysis(user_input, extracted_data):
    """Answer a chat question about extracted LinkedIn data with a keyword-routed reply.

    The lower-cased question is matched by substring against keyword groups
    (overview, summary, projects, skills, author); the first matching group
    selects the Markdown reply. A question matching no group gets a generic
    reply that echoes the question and the first 200 characters of content.

    Args:
        user_input: The question typed by the user.
        extracted_data: Result dict; the ``content_blocks``, ``page_info`` and
            ``data_type`` keys are read (each optional). A falsy value means
            nothing has been extracted yet.

    Returns:
        A Markdown string. This function never raises: any exception is turned
        into an "Analysis error" message so the chat always has text to show.
    """
    # NOTE(review): several replies below assert fixed facts (GitHub profile,
    # "University Information Chatbot", ...) regardless of what was extracted.
    # They are only accurate for one specific post - confirm whether these
    # should be derived from ``content_blocks`` instead.
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        content_blocks = extracted_data.get('content_blocks', [])
        page_info = extracted_data.get('page_info', {})
        data_type = extracted_data.get('data_type', 'profile')
        title = page_info.get('title', 'LinkedIn Content')

        question = user_input.lower()

        def asks_about(*keywords):
            # Substring match on purpose: 'project' must also match "projects".
            return any(keyword in question for keyword in keywords)

        def generic_reply():
            preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
            return "\n".join([
                "**🤖 Analysis Response:**",
                "",
                "I've analyzed this LinkedIn post for you.",
                "",
                f'**Your question:** "{user_input}"',
                "",
                f"**Post Content:** {preview}",
                "",
                "This appears to be a post where the author is sharing their GitHub profile "
                "and showcasing technical projects they've built.",
                "",
                "**Try asking:**",
                '- "What projects are mentioned?"',
                '- "Tell me about the GitHub profile"',
                '- "What is the main purpose of this post?"',
                '- "What skills does the author have?"',
            ])

        if asks_about('what is this', "what's this", 'post about', 'content about'):
            if not content_blocks:
                # Previously fell off the end and returned None.
                return generic_reply()
            return "\n".join([
                "**📝 Post Analysis:**",
                "",
                "This LinkedIn post is about:",
                "",
                f"**{content_blocks[0]}**",
                "",
                "The author is sharing their GitHub profile and showcasing projects "
                "they've been working on, including:",
                "",
                "• **University Information Chatbot** - An AI chatbot for university information",
                "• **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data",
                "",
                "This appears to be a professional sharing their technical projects "
                "and inviting others to check out their work.",
            ])

        if asks_about('summary', 'summarize', 'overview'):
            if not content_blocks:
                # Previously fell off the end and returned None.
                return generic_reply()
            # First 20 words of each of the first three blocks.
            key_points = [
                f"{index}. {' '.join(block.split()[:20])}..."
                for index, block in enumerate(content_blocks[:3], start=1)
            ]
            return "\n".join([
                "**📊 Summary**",
                "",
                f"**Title:** {title}",
                f"**Type:** {data_type.title()}",
                f"**Content Blocks:** {len(content_blocks)}",
                "",
                "**Key Content:**",
                *key_points,
                "",
                "The post showcases technical projects and professional work.",
            ])

        if asks_about('project', 'github', 'repository'):
            return "\n".join([
                "**🛠️ Projects Mentioned:**",
                "",
                "Based on the LinkedIn post, the author is sharing these projects:",
                "",
                "1. **University Information Chatbot** - An AI-powered chatbot for "
                "providing university-related information",
                "2. **LinkedIn Data Extractor** - A tool for extracting and analyzing "
                "data from LinkedIn profiles",
                "",
                "The author is inviting people to check out their GitHub profile to see these projects.",
            ])

        if asks_about('skill', 'technology', 'expertise'):
            return "\n".join([
                "**💻 Technical Skills Implied:**",
                "",
                "Based on the projects mentioned, the author likely has skills in:",
                "",
                "• Python programming",
                "• Web development",
                "• AI/Chatbot development",
                "• Data extraction/processing",
                "• API integration",
                "• GitHub repository management",
                "",
                "These skills are typical for building chatbots and data extraction tools.",
            ])

        if asks_about('who', 'author', 'person'):
            return "\n".join([
                "**👤 About the Author:**",
                "",
                "Based on the LinkedIn post:",
                "",
                f"**Title:** {title}",
                "",
                "This appears to be a professional developer/engineer who:",
                "- Builds AI chatbots and data extraction tools",
                "- Shares their work on GitHub",
                "- Is active on LinkedIn for professional networking",
                "- Works on projects like University Information systems and LinkedIn data analysis",
            ])

        return generic_reply()

    except Exception as e:
        # UI boundary: report the failure as chat text instead of crashing the page.
        return f"❌ Analysis error: {str(e)}"
123
+
124
def extract_linkedin_data(url, data_type):
    """Download a LinkedIn page and reduce it to a list of text blocks.

    The page is fetched with browser-like headers, non-content tags are
    removed, whitespace is collapsed, and the text is split on '.' into
    blocks longer than 30 characters.

    Args:
        url: Address of the page to fetch.
        data_type: Label stored unchanged in the result under ``data_type``.

    Returns:
        On success, a dict with ``page_info`` (title, url, response_code,
        content_length), ``content_blocks``, ``extraction_time``,
        ``data_type`` and ``status == "success"``. On any failure, a dict
        with ``error`` and ``status == "error"``; exceptions are not raised.
    """
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=request_headers, timeout=25)

        status_code = response.status_code
        if status_code != 200:
            return {
                "error": f"Failed to access page (Status: {status_code})",
                "status": "error"
            }

        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip elements that carry no readable page content.
        for tag in soup.find_all(["script", "style", "meta", "link", "nav", "header", "footer"]):
            tag.decompose()

        # Collapse the remaining text into space-separated pieces.
        pieces = []
        for raw_line in soup.get_text().splitlines():
            for fragment in raw_line.strip().split(" "):
                fragment = fragment.strip()
                if fragment:
                    pieces.append(fragment)
        clean_text = ' '.join(pieces)

        # Keep only '.'-delimited segments long enough to be meaningful.
        paragraphs = []
        for segment in clean_text.split('.'):
            segment = segment.strip()
            if len(segment) > 30:
                paragraphs.append(segment)

        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title_tag = soup.find('title')
        if title_tag:
            page_title = title_tag.text.strip()
        else:
            page_title = "LinkedIn Page"

        page_info = {
            "title": page_title,
            "url": url,
            "response_code": status_code,
            "content_length": len(clean_text)
        }

        return {
            "page_info": page_info,
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }

    except Exception as exc:
        return {"error": f"Extraction error: {str(exc)}", "status": "error"}
184
+
185
def display_metrics(extracted_data):
    """Render four headline metrics for an extraction result in a row of columns.

    Shows the number of content blocks, the total word count across blocks,
    the cleaned-text character count and the HTTP response code.

    Args:
        extracted_data: A successful extraction result containing
            ``page_info`` and ``content_blocks``. Falsy values and results
            without those keys (for example the error-shaped dict returned by
            a failed extraction) are ignored and nothing is rendered.
    """
    if not extracted_data:
        return

    page_info = extracted_data.get('page_info')
    content_blocks = extracted_data.get('content_blocks')
    if page_info is None or content_blocks is None:
        # Error-shaped results are truthy but carry no metrics; indexing them
        # directly used to raise KeyError.
        return

    total_words = sum(len(block.split()) for block in content_blocks)
    metrics = (
        ("Content Blocks", len(content_blocks)),
        ("Total Words", total_words),
        ("Characters", f"{page_info['content_length']:,}"),
        ("Response Code", page_info['response_code']),
    )

    for column, (label, value) in zip(st.columns(len(metrics)), metrics):
        with column:
            st.metric(label, value)
207
+
208
def main():
    """Render the LinkedIn AI Analyzer page.

    Layout, in render order: a sidebar (content type, URL input, quick-test
    company buttons, the extract button and chat management), the extraction
    results, the chat section and a static feature list. Extraction results
    and chat history are kept in ``st.session_state`` across reruns.
    """
    st.title("💼 LinkedIn AI Analyzer")

    # Seed session state once; later reruns keep whatever is already stored.
    session_defaults = {
        "extracted_data": None,
        "chat_history": [],
        "processing": False,
        "current_url": "",
        "last_user_input": "",
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    # Sidebar
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")

        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])

        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/",
        }

        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL",
        )

        # Quick test URLs
        st.markdown("### 🚀 Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }

        for name, url in test_urls.items():
            if st.button(f"🏢 {name}", key=name, use_container_width=True):
                # Remembered as the fallback URL for the extract button below.
                st.session_state.current_url = url
                st.rerun()

        # Extract button
        if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
            # A typed URL wins; otherwise use the last quick-test / extracted URL.
            url_to_use = linkedin_url.strip() or st.session_state.current_url

            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                try:
                    with st.spinner("🔄 Extracting LinkedIn data..."):
                        extracted_data = extract_linkedin_data(url_to_use, data_type)

                        if extracted_data.get("status") == "success":
                            st.session_state.extracted_data = extracted_data
                            st.session_state.current_url = url_to_use
                            st.session_state.chat_history = []  # Clear previous chat
                            st.session_state.last_user_input = ""  # Reset last input
                            st.success("✅ Data extracted successfully!")
                            st.balloons()
                        else:
                            error_msg = extracted_data.get("error", "Unknown error")
                            st.error(f"❌ Extraction failed: {error_msg}")
                finally:
                    # Always reset, so an unexpected exception cannot leave the
                    # page stuck on the "Processing" message.
                    st.session_state.processing = False

        # Chat management
        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("💬 Chat Management")
            if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("🗑️ Chat history cleared!")

    # Main content area
    st.markdown("### 📊 Extraction Results")

    if st.session_state.processing:
        st.info("🔄 Processing LinkedIn data...")

    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']

        st.success("✅ Extraction Complete")

        # Display metrics
        display_metrics(data)

        # Display page info and sample content in columns
        info_col, sample_col = st.columns(2)

        with info_col:
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extracted:** {data['extraction_time']}")

        with sample_col:
            st.markdown("#### 📝 Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                    st.write(block)

            if len(content_blocks) > 3:
                st.info(f"📄 +{len(content_blocks) - 3} more blocks")

    else:
        st.info("\n".join([
            "👋 **Welcome to LinkedIn AI Analyzer!**",
            "",
            "**To get started:**",
            "1. Select content type in sidebar",
            "2. Enter a LinkedIn URL or click suggested company",
            '3. Click "Extract & Analyze"',
            "4. Chat with the AI below about the extracted content",
            "",
            "**Supported URLs:**",
            "- 👤 Public Profiles",
            "- 🏢 Company Pages",
            "- 📝 Public Posts",
        ]))

    # Chat section
    st.markdown("---")
    st.markdown("### 💬 Chat with AI")

    stored = st.session_state.extracted_data
    has_data = bool(stored) and stored.get("status") == "success"

    if has_data:
        st.success("💬 Chat ready! Ask questions about the LinkedIn data below.")

        # Display chat history - ONLY ONCE
        for chat in st.session_state.chat_history:
            if chat["role"] in ("user", "assistant"):
                with st.chat_message(chat["role"]):
                    st.write(chat['content'])

        # Suggested questions when no history
        if not st.session_state.chat_history:
            st.markdown("#### 💡 Try asking:")
            suggestions = [
                "What is this post about?",
                "Summarize this content",
                "What projects are mentioned?",
                "Tell me about the GitHub profile",
            ]

            for i, (col, suggestion) in enumerate(zip(st.columns(len(suggestions)), suggestions)):
                with col:
                    if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                        st.info(f"💡 Type: '{suggestion}' in the chat below")

        # CHAT INPUT - WITH DUPLICATION PROTECTION
        user_input = st.chat_input("Type your question about the LinkedIn data here...")

        # NOTE(review): this guard also drops a question that is deliberately
        # asked twice in a row - confirm that is intended.
        if user_input and user_input != st.session_state.last_user_input:
            # Store the current input to prevent duplication
            st.session_state.last_user_input = user_input

            # Add user message
            st.session_state.chat_history.append({"role": "user", "content": user_input})

            # Generate and add AI response
            with st.spinner("🤔 Analyzing..."):
                response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
                st.session_state.chat_history.append({"role": "assistant", "content": response})

            # Force rerun to show updated chat
            st.rerun()

    # Features section at bottom
    st.markdown("---")
    st.markdown("### 🚀 Features")

    features = [
        ("📊 Data Extraction", ["LinkedIn content scraping", "Text processing", "Content analysis"]),
        ("💬 Smart Chat", ["Interactive Q&A", "Content analysis", "Professional insights"]),
        ("🔍 Insights", ["Summary generation", "Skill detection", "Experience analysis"]),
    ]

    for col, (heading, bullets) in zip(st.columns(len(features)), features):
        with col:
            st.markdown("\n".join([f"**{heading}**", *(f"- {bullet}" for bullet in bullets)]))


if __name__ == "__main__":
    main()