File size: 17,200 Bytes
8c65300
 
 
 
 
 
 
4fcf08a
 
 
f9852cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fcf08a
 
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffc0355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f121736
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f121736
 
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f121736
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f121736
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
f121736
8c65300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
# pages/linkedin_extractor.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import time
import os
# Add to TOP of each extractor file
import streamlit as st

# ============================================
# AUTHENTICATION CHECK
# ============================================
# Gate the page: a session whose "authenticated" flag is missing or falsy in
# st.session_state is shown an "Access Denied" screen and the script run ends.
# NOTE(review): the flag is presumably set by the login page served at "/"
# (the button below links there) - confirm against the login module.
if "authenticated" not in st.session_state or not st.session_state.authenticated:
    # Page config for the denial screen. The regular config further down is
    # never reached on this path because st.stop() halts the script first.
    st.set_page_config(page_title="Access Denied", page_icon="πŸ”’", layout="centered")
    
    # Banner explaining why the page is blocked (inline CSS + raw HTML, hence
    # unsafe_allow_html=True).
    st.markdown("""
    <style>
        .error-container {
            text-align: center;
            padding: 3rem;
            background: linear-gradient(135deg, #ef4444, #dc2626);
            color: white;
            border-radius: 10px;
            margin: 2rem 0;
        }
    </style>
    
    <div class="error-container">
        <h1>πŸ” Access Denied</h1>
        <p style="font-size: 1.2rem;">Please login to access this page</p>
    </div>
    """, unsafe_allow_html=True)
    
    # Plain HTML link styled as a button that navigates to the app root ("/").
    st.markdown("""
    <div style="text-align: center; margin-top: 2rem;">
        <a href="/">
            <button style="
                background-color: #4285F4;
                color: white;
                padding: 12px 24px;
                border-radius: 6px;
                border: none;
                font-size: 16px;
                cursor: pointer;
            ">
                πŸ” Go to Login Page
            </button>
        </a>
    </div>
    """, unsafe_allow_html=True)
    
    # End this script run here so none of the analyzer UI below is rendered.
    st.stop()

# Page config for authenticated sessions (wide layout for the analyzer UI).
st.set_page_config(
    page_title="LinkedIn AI Analyzer",
    page_icon="πŸ’Ό",
    layout="wide"
)

def enhanced_chat_analysis(user_input, extracted_data):
    """Answer a chat question about extracted LinkedIn data via keyword matching.

    The reply is picked by looking for keyword groups in the lower-cased
    question, in this order: "what is this"/"about", summary, projects,
    skills, author. If nothing matches, a generic response with a preview of
    the first content block and a list of suggested questions is returned.
    Most of the reply text is canned; only the title, data type, block count
    and block excerpts come from ``extracted_data``.

    Args:
        user_input: The question typed by the user (a string).
        extracted_data: Extraction result dict. The keys read here are
            ``'content_blocks'`` (list of strings), ``'page_info'`` (dict
            with an optional ``'title'``) and ``'data_type'``. A falsy value
            means nothing has been extracted yet.

    Returns:
        A markdown-formatted reply string. The function never raises and
        never returns ``None``: unexpected errors are reported as an
        "Analysis error" string.
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        # Treat a missing or None block list the same as an empty one.
        content_blocks = extracted_data.get('content_blocks') or []
        page_info = extracted_data.get('page_info', {})
        data_type = extracted_data.get('data_type', 'profile')

        # Get basic info
        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)

        user_input_lower = user_input.lower()

        def mentions(*keywords):
            """Return True if any keyword occurs in the lower-cased question."""
            return any(keyword in user_input_lower for keyword in keywords)

        # The two content-driven replies need at least one block. Without
        # one, fall through to the remaining handlers instead of silently
        # returning None (which the chat UI would render as "None").
        if content_blocks and mentions('what is this', 'what\'s this', 'post about', 'content about'):
            main_content = content_blocks[0]
            return f"""**πŸ“ Post Analysis:**

This LinkedIn post is about:

**{main_content}**

The author is sharing their GitHub profile and showcasing projects they've been working on, including:

β€’ **University Information Chatbot** - An AI chatbot for university information
β€’ **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data

This appears to be a professional sharing their technical projects and inviting others to check out their work."""

        if content_blocks and mentions('summary', 'summarize', 'overview'):
            # First 20 words of each of the first three blocks, numbered.
            main_points = [
                f"{number}. {' '.join(block.split()[:20])}..."
                for number, block in enumerate(content_blocks[:3], start=1)
            ]
            key_content = "\n".join(main_points)

            return f"""**πŸ“Š Summary**

**Title:** {title}
**Type:** {data_type.title()}
**Content Blocks:** {total_blocks}

**Key Content:**
{key_content}

The post showcases technical projects and professional work."""

        if mentions('project', 'github', 'repository'):
            return """**πŸ› οΈ Projects Mentioned:**

Based on the LinkedIn post, the author is sharing these projects:

1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information
2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles

The author is inviting people to check out their GitHub profile to see these projects."""

        if mentions('skill', 'technology', 'expertise'):
            return """**πŸ’» Technical Skills Implied:**

Based on the projects mentioned, the author likely has skills in:

β€’ Python programming
β€’ Web development
β€’ AI/Chatbot development
β€’ Data extraction/processing
β€’ API integration
β€’ GitHub repository management

These skills are typical for building chatbots and data extraction tools."""

        if mentions('who', 'author', 'person'):
            return f"""**πŸ‘€ About the Author:**

Based on the LinkedIn post:

**Title:** {title}

This appears to be a professional developer/engineer who:
- Builds AI chatbots and data extraction tools
- Shares their work on GitHub
- Is active on LinkedIn for professional networking
- Works on projects like University Information systems and LinkedIn data analysis"""

        # Generic fallback: echo the question, preview the first block and
        # suggest questions that the keyword handlers above understand.
        post_preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
        response_lines = [
            "**πŸ€– Analysis Response:**",
            "",
            "I've analyzed this LinkedIn post for you.",
            "",
            f"**Your question:** \"{user_input}\"",
            "",
            f"**Post Content:** {post_preview}",
            "",
            "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
            "",
            "**Try asking:**",
            "- \"What projects are mentioned?\"",
            "- \"Tell me about the GitHub profile\"",
            "- \"What is the main purpose of this post?\"",
            "- \"What skills does the author have?\"",
        ]
        return "\n".join(response_lines)

    except Exception as e:
        # Chat boundary: surface the problem in the conversation rather than
        # crashing the page.
        return f"❌ Analysis error: {str(e)}"

def extract_linkedin_data(url, data_type):
    """Download a LinkedIn page and reduce it to sentence-like text blocks.

    The page is fetched with browser-like headers, stripped of script/style
    and navigation markup, flattened to whitespace-normalised text and then
    split on '.' into blocks longer than 30 characters.

    Args:
        url: Address of the page to fetch.
        data_type: Label stored unchanged in the result under "data_type".

    Returns:
        On success, a dict with "page_info" (title, url, response_code,
        content_length), "content_blocks", "extraction_time", "data_type"
        and "status" == "success". On any failure (non-200 response, no
        usable text, or an exception) a dict with "error" and
        "status" == "error". The function does not raise.
    """
    try:
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }

        st.info(f"🌐 Accessing: {url}")
        response = requests.get(url, headers=request_headers, timeout=25)

        status_code = response.status_code
        if status_code != 200:
            return {
                "error": f"Failed to access page (Status: {status_code})",
                "status": "error"
            }

        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop markup that carries no readable page content.
        for tag in soup(["script", "style", "meta", "link", "nav", "header", "footer"]):
            tag.decompose()

        # Flatten the text: trim each line, break it on double spaces and
        # keep only the non-empty pieces.
        fragments = []
        for raw_line in soup.get_text().splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        clean_text = ' '.join(fragments)

        # A "block" is any '.'-delimited span longer than 30 characters.
        sentences = (candidate.strip() for candidate in clean_text.split('.'))
        paragraphs = [sentence for sentence in sentences if len(sentence) > 30]

        if not paragraphs:
            return {
                "error": "No meaningful content found. The page might require login or have restricted access.",
                "status": "error"
            }

        title_tag = soup.find('title')
        page_title = "LinkedIn Page"
        if title_tag:
            page_title = title_tag.text.strip()

        page_info = {
            "title": page_title,
            "url": url,
            "response_code": status_code,
            "content_length": len(clean_text)
        }

        return {
            "page_info": page_info,
            "content_blocks": paragraphs,
            "extraction_time": time.strftime('%Y-%m-%d %H:%M:%S'),
            "data_type": data_type,
            "status": "success"
        }

    except Exception as exc:
        return {"error": f"Extraction error: {exc}", "status": "error"}

def display_metrics(extracted_data):
    """Render a four-column metric row for an extraction result.

    Shows the number of content blocks, the total word count across all
    blocks, the character count stored in page_info['content_length'] and
    the HTTP response code. Does nothing when ``extracted_data`` is falsy.
    """
    if not extracted_data:
        return

    info = extracted_data['page_info']
    blocks = extracted_data['content_blocks']

    columns = st.columns(4)

    with columns[0]:
        st.metric("Content Blocks", len(blocks))

    with columns[1]:
        word_count = 0
        for text in blocks:
            word_count += len(text.split())
        st.metric("Total Words", word_count)

    with columns[2]:
        st.metric("Characters", f"{info['content_length']:,}")

    with columns[3]:
        st.metric("Response Code", info['response_code'])

def _init_session_state():
    """Create every session-state key this page reads, if it is missing."""
    defaults = {
        "extracted_data": None,
        "chat_history": [],
        "processing": False,
        "current_url": "",
        "last_user_input": "",
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


def _render_sidebar():
    """Draw the sidebar: URL entry, quick-test shortcuts, extraction, chat reset."""
    with st.sidebar:
        st.markdown("### βš™οΈ Configuration")

        data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])

        url_placeholder = {
            "profile": "https://www.linkedin.com/in/username/",
            "company": "https://www.linkedin.com/company/companyname/",
            "post": "https://www.linkedin.com/posts/username_postid/"
        }

        linkedin_url = st.text_input(
            "🌐 LinkedIn URL",
            placeholder=url_placeholder[data_type],
            help="Enter a public LinkedIn URL"
        )

        # Quick test URLs: a click stores the URL so the next
        # "Extract & Analyze" uses it when the text box is empty.
        st.markdown("### πŸš€ Quick Test")
        test_urls = {
            "Microsoft": "https://www.linkedin.com/company/microsoft/",
            "Google": "https://www.linkedin.com/company/google/",
            "Apple": "https://www.linkedin.com/company/apple/",
        }

        for name, url in test_urls.items():
            if st.button(f"🏒 {name}", key=name, use_container_width=True):
                st.session_state.current_url = url
                st.rerun()

        # Extract button
        if st.button("πŸš€ Extract & Analyze", type="primary", use_container_width=True):
            # Typed URL wins; otherwise fall back to the last stored URL.
            url_to_use = linkedin_url.strip() or st.session_state.current_url

            if not url_to_use:
                st.warning("⚠️ Please enter a LinkedIn URL")
            elif not url_to_use.startswith('https://www.linkedin.com/'):
                st.error("❌ Please enter a valid LinkedIn URL")
            else:
                st.session_state.processing = True
                try:
                    with st.spinner("πŸ”„ Extracting LinkedIn data..."):
                        extracted_data = extract_linkedin_data(url_to_use, data_type)

                        if extracted_data.get("status") == "success":
                            # New data invalidates the previous conversation.
                            st.session_state.extracted_data = extracted_data
                            st.session_state.current_url = url_to_use
                            st.session_state.chat_history = []
                            st.session_state.last_user_input = ""
                            st.success("βœ… Data extracted successfully!")
                            st.balloons()
                        else:
                            error_msg = extracted_data.get("error", "Unknown error")
                            st.error(f"❌ Extraction failed: {error_msg}")
                finally:
                    # Always clear the flag so an unexpected error cannot
                    # leave the page stuck on the "Processing" notice.
                    st.session_state.processing = False

        # Chat management
        if st.session_state.extracted_data:
            st.markdown("---")
            st.subheader("πŸ’¬ Chat Management")
            if st.button("πŸ—‘οΈ Clear Chat", type="secondary", use_container_width=True):
                st.session_state.chat_history = []
                st.session_state.last_user_input = ""
                st.success("πŸ—‘οΈ Chat history cleared!")


def _render_results():
    """Show the extraction result, a processing notice, or the welcome text."""
    st.markdown("### πŸ“Š Extraction Results")

    if st.session_state.processing:
        st.info("πŸ”„ Processing LinkedIn data...")

    elif st.session_state.extracted_data:
        data = st.session_state.extracted_data
        page_info = data['page_info']
        content_blocks = data['content_blocks']

        st.success("βœ… Extraction Complete")

        # Display metrics
        display_metrics(data)

        # Display page info and sample content in columns
        col1, col2 = st.columns(2)

        with col1:
            st.markdown("#### 🏷️ Page Information")
            st.write(f"**Title:** {page_info['title']}")
            st.write(f"**URL:** {page_info['url']}")
            st.write(f"**Type:** {data['data_type'].title()}")
            st.write(f"**Content Blocks:** {len(content_blocks)}")
            st.write(f"**Extracted:** {data['extraction_time']}")

        with col2:
            st.markdown("#### πŸ“ Sample Content")
            for i, block in enumerate(content_blocks[:3]):
                with st.expander(f"Block {i+1} ({len(block.split())} words)"):
                    st.write(block)

            if len(content_blocks) > 3:
                st.info(f"πŸ“„ +{len(content_blocks) - 3} more blocks")

    else:
        st.info("""
        πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
        
        **To get started:**
        1. Select content type in sidebar
        2. Enter a LinkedIn URL or click suggested company
        3. Click "Extract & Analyze" 
        4. Chat with the AI below about the extracted content
        
        **Supported URLs:**
        - πŸ‘€ Public Profiles
        - 🏒 Company Pages  
        - πŸ“ Public Posts
        """)


def _render_chat():
    """Render the chat section: history, suggestions and the question box."""
    st.markdown("---")
    st.markdown("### πŸ’¬ Chat with AI")

    has_data = st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success"
    if not has_data:
        # Nothing to chat about until an extraction has succeeded.
        return

    st.success("πŸ’¬ Chat ready! Ask questions about the LinkedIn data below.")

    # Display chat history (entries with any other role are skipped).
    for chat in st.session_state.chat_history:
        if chat["role"] in ("user", "assistant"):
            with st.chat_message(chat["role"]):
                st.write(chat['content'])

    # Suggested questions when no history
    if not st.session_state.chat_history:
        st.markdown("#### πŸ’‘ Try asking:")
        suggestions = [
            "What is this post about?",
            "Summarize this content",
            "What projects are mentioned?",
            "Tell me about the GitHub profile"
        ]

        cols = st.columns(len(suggestions))
        for i, suggestion in enumerate(suggestions):
            with cols[i]:
                if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
                    st.info(f"πŸ’‘ Type: '{suggestion}' in the chat below")

    # CHAT INPUT
    user_input = st.chat_input("Type your question about the LinkedIn data here...")

    # st.chat_input yields the text only on the run triggered by the
    # submission, so no de-duplication against the previous question is
    # needed; comparing with last_user_input would silently ignore a user who
    # deliberately asks the same question twice in a row.
    if user_input:
        st.session_state.last_user_input = user_input
        st.session_state.chat_history.append({"role": "user", "content": user_input})

        with st.spinner("πŸ€” Analyzing..."):
            response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
            st.session_state.chat_history.append({"role": "assistant", "content": response})

        # Re-run so the new exchange is drawn by the history loop above.
        st.rerun()


def _render_features():
    """Render the static three-column feature overview at the page bottom."""
    st.markdown("---")
    st.markdown("### πŸš€ Features")

    feature_cols = st.columns(3)

    with feature_cols[0]:
        st.markdown("""
        **πŸ“Š Data Extraction**
        - LinkedIn content scraping
        - Text processing
        - Content analysis
        """)

    with feature_cols[1]:
        st.markdown("""
        **πŸ’¬ Smart Chat**
        - Interactive Q&A
        - Content analysis
        - Professional insights
        """)

    with feature_cols[2]:
        st.markdown("""
        **πŸ” Insights**
        - Summary generation
        - Skill detection
        - Experience analysis
        """)


def main():
    """Render the LinkedIn AI Analyzer page.

    Runs top to bottom on every Streamlit script run: page title,
    session-state defaults, sidebar (URL input, quick-test buttons,
    extraction, chat reset), extraction results, chat section and the static
    feature overview. All state that must survive reruns lives in
    ``st.session_state`` under the keys ``extracted_data``, ``chat_history``,
    ``processing``, ``current_url`` and ``last_user_input``.
    """
    st.title("πŸ’Ό LinkedIn AI Analyzer")

    _init_session_state()
    _render_sidebar()
    _render_results()
    _render_chat()
    _render_features()

# Entry point: render the page when this file is executed as a script
# (nothing runs if the module is merely imported).
if __name__ == "__main__":
    main()