Refat81 committed on
Commit
095b424
·
verified ·
1 Parent(s): 9e94e12

Update pages/linkedin_extractor.py

Browse files
Files changed (1) hide show
  1. pages/linkedin_extractor.py +162 -361
pages/linkedin_extractor.py CHANGED
@@ -2,13 +2,6 @@
2
  import streamlit as st
3
  import requests
4
  from bs4 import BeautifulSoup
5
- from langchain_text_splitters import CharacterTextSplitter
6
- from langchain_community.embeddings import HuggingFaceEmbeddings
7
- from langchain_community.vectorstores import FAISS
8
- from langchain.memory import ConversationBufferMemory
9
- from langchain.chains import ConversationalRetrievalChain
10
- from langchain_core.documents import Document
11
- from langchain_community.llms import HuggingFaceHub
12
  import re
13
  import time
14
  import os
@@ -19,144 +12,114 @@ st.set_page_config(
19
  layout="wide"
20
  )
21
 
22
- def get_embeddings():
23
- """Initialize embeddings with better fallback options"""
24
- try:
25
- # Try multiple embedding models with different approaches
26
- model_options = [
27
- "sentence-transformers/all-MiniLM-L6-v2",
28
- "sentence-transformers/all-mpnet-base-v2",
29
- "BAAI/bge-small-en-v1.5",
30
- "sentence-transformers/paraphrase-MiniLM-L6-v2"
31
- ]
32
-
33
- for model_name in model_options:
34
- try:
35
- st.info(f"πŸ”„ Trying to load: {model_name}")
36
- embeddings = HuggingFaceEmbeddings(
37
- model_name=model_name,
38
- model_kwargs={'device': 'cpu'},
39
- encode_kwargs={
40
- 'normalize_embeddings': True,
41
- 'batch_size': 32
42
- }
43
- )
44
- # Test the embeddings
45
- test_text = "Hello world"
46
- test_embedding = embeddings.embed_query(test_text)
47
- if test_embedding and len(test_embedding) > 0:
48
- st.success(f"βœ… Loaded embeddings: {model_name.split('/')[-1]}")
49
- return embeddings
50
- except Exception as e:
51
- st.warning(f"⚠️ Failed to load {model_name}: {str(e)}")
52
- continue
53
-
54
- # If all models fail, try a simpler approach
55
- st.warning("πŸ”„ Trying fallback embedding method...")
56
- try:
57
- embeddings = HuggingFaceEmbeddings(
58
- model_name="sentence-transformers/all-MiniLM-L6-v2",
59
- cache_folder="/tmp/embeddings"
60
- )
61
- st.success("βœ… Loaded fallback embeddings")
62
- return embeddings
63
- except Exception as e:
64
- st.error(f"❌ Fallback also failed: {e}")
65
- return None
66
-
67
- except Exception as e:
68
- st.error(f"❌ Embeddings error: {e}")
69
- return None
70
-
71
- def get_llm():
72
- """Initialize Mistral 7B LLM with better error handling"""
73
- try:
74
- api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
75
- if not api_key:
76
- st.error("""
77
- ❌ HuggingFace API Key not found!
78
-
79
- Please add your API key:
80
- 1. Go to Space Settings β†’ Variables and Secrets
81
- 2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
82
- 3. Restart the Space
83
-
84
- Get free API key: https://huggingface.co/settings/tokens
85
- """)
86
- return None
87
-
88
- # Try multiple models
89
- model_options = [
90
- "mistralai/Mistral-7B-Instruct-v0.1",
91
- "HuggingFaceH4/zephyr-7b-beta",
92
- "google/flan-t5-large"
93
- ]
94
-
95
- for model_id in model_options:
96
- try:
97
- st.info(f"πŸ”„ Trying to load: {model_id}")
98
- llm = HuggingFaceHub(
99
- repo_id=model_id,
100
- huggingfacehub_api_token=api_key,
101
- model_kwargs={
102
- "temperature": 0.7,
103
- "max_length": 2048,
104
- "max_new_tokens": 512,
105
- "top_p": 0.95,
106
- "repetition_penalty": 1.1,
107
- "do_sample": True
108
- }
109
- )
110
- # Test the model
111
- test_response = llm.invoke("Hello")
112
- if test_response:
113
- st.success(f"βœ… Loaded model: {model_id.split('/')[-1]}")
114
- return llm
115
- except Exception as e:
116
- st.warning(f"⚠️ Failed to load {model_id}: {str(e)}")
117
- continue
118
-
119
- st.error("❌ All AI models failed to load")
120
- return None
121
-
122
- except Exception as e:
123
- st.error(f"❌ AI Model error: {e}")
124
- return None
125
-
126
- def simple_chat_analysis(user_input, extracted_data):
127
- """Simple chat analysis without embeddings as fallback"""
128
  try:
129
  if not extracted_data:
130
- return "No data available for analysis."
131
 
132
  content_blocks = extracted_data.get('content_blocks', [])
133
  page_info = extracted_data.get('page_info', {})
 
134
 
135
- # Create context from extracted data
136
- context = f"Page Title: {page_info.get('title', 'N/A')}\n"
137
- context += f"Content Type: {extracted_data.get('data_type', 'N/A')}\n"
138
- context += f"Extracted Content:\n"
139
 
140
- for i, block in enumerate(content_blocks[:5]): # Limit context
141
- context += f"Block {i+1}: {block}\n"
142
-
143
- # Simple rule-based responses
144
  user_input_lower = user_input.lower()
145
 
146
- if any(word in user_input_lower for word in ['summary', 'summarize', 'overview']):
147
- return f"Based on the LinkedIn data, here's a summary:\n\nTitle: {page_info.get('title', 'N/A')}\nContent Type: {extracted_data.get('data_type', 'N/A')}\nTotal Content Blocks: {len(content_blocks)}\nKey Content: {content_blocks[0][:200] if content_blocks else 'No content available'}..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- elif any(word in user_input_lower for word in ['skills', 'expertise', 'technologies']):
150
- return "I can analyze the content for skills and expertise. The extracted data shows professional information that can be reviewed for specific skills mentioned in the content blocks."
 
 
 
 
 
 
 
151
 
152
- elif any(word in user_input_lower for word in ['experience', 'background', 'career']):
153
- return "The LinkedIn data contains professional experience information. I can help you analyze the career background and work history mentioned in the profile."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
  else:
156
- return f"I've analyzed the LinkedIn data. {page_info.get('title', 'The profile')} contains {len(content_blocks)} content blocks with professional information. You can ask me about summaries, skills, experience, or specific details from the extracted content."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  except Exception as e:
159
- return f"Analysis error: {str(e)}"
160
 
161
  def extract_linkedin_data(url, data_type):
162
  """Extract data from LinkedIn URLs"""
@@ -164,11 +127,6 @@ def extract_linkedin_data(url, data_type):
164
  headers = {
165
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
166
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
167
- 'Accept-Language': 'en-US,en;q=0.5',
168
- 'Accept-Encoding': 'gzip, deflate, br',
169
- 'DNT': '1',
170
- 'Connection': 'keep-alive',
171
- 'Upgrade-Insecure-Requests': '1',
172
  }
173
 
174
  st.info(f"🌐 Accessing: {url}")
@@ -193,7 +151,7 @@ def extract_linkedin_data(url, data_type):
193
  clean_text = ' '.join(chunk for chunk in chunks if chunk)
194
 
195
  # Extract meaningful content
196
- paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 40]
197
 
198
  if not paragraphs:
199
  return {
@@ -221,107 +179,9 @@ def extract_linkedin_data(url, data_type):
221
 
222
  return extracted_data
223
 
224
- except requests.exceptions.Timeout:
225
- return {"error": "Request timed out. Please try again.", "status": "error"}
226
- except requests.exceptions.ConnectionError:
227
- return {"error": "Connection failed. Please check the URL and try again.", "status": "error"}
228
  except Exception as e:
229
  return {"error": f"Extraction error: {str(e)}", "status": "error"}
230
 
231
- def process_extracted_data(extracted_data):
232
- """Process extracted data for AI analysis with fallbacks"""
233
- if not extracted_data or extracted_data.get("status") != "success":
234
- return None, []
235
-
236
- try:
237
- page_info = extracted_data['page_info']
238
- content_blocks = extracted_data['content_blocks']
239
-
240
- # Structure the data for AI
241
- all_text = f"LINKEDIN DATA ANALYSIS REPORT\n"
242
- all_text += "=" * 70 + "\n\n"
243
- all_text += f"πŸ“„ PAGE INFORMATION:\n"
244
- all_text += f"Title: {page_info['title']}\n"
245
- all_text += f"URL: {page_info['url']}\n"
246
- all_text += f"Type: {extracted_data['data_type'].upper()}\n"
247
- all_text += f"Extracted: {extracted_data['extraction_time']}\n"
248
- all_text += f"Response Code: {page_info['response_code']}\n"
249
- all_text += f"Content Length: {page_info['content_length']} characters\n\n"
250
-
251
- all_text += f"πŸ“Š CONTENT ANALYSIS:\n"
252
- all_text += f"Total Content Blocks: {len(content_blocks)}\n\n"
253
-
254
- # Add content blocks
255
- for i, block in enumerate(content_blocks[:10]): # Limit for performance
256
- all_text += f"--- CONTENT BLOCK {i+1} ---\n"
257
- all_text += f"Words: {len(block.split())} | Characters: {len(block)}\n"
258
- all_text += f"Content: {block}\n\n"
259
-
260
- all_text += "=" * 70 + "\n"
261
- all_text += "END OF EXTRACTION REPORT"
262
-
263
- # Try to create vector store
264
- embeddings = get_embeddings()
265
- if embeddings is None:
266
- st.warning("⚠️ Using simple text processing (embeddings unavailable)")
267
- # Return simple document structure
268
- documents = [Document(page_content=all_text)]
269
- return "simple", documents
270
-
271
- # Split into chunks
272
- splitter = CharacterTextSplitter(
273
- separator="\n",
274
- chunk_size=800, # Smaller for better performance
275
- chunk_overlap=100,
276
- length_function=len
277
- )
278
-
279
- chunks = splitter.split_text(all_text)
280
- documents = [Document(page_content=chunk) for chunk in chunks]
281
-
282
- # Create vector store
283
- vectorstore = FAISS.from_documents(documents, embeddings)
284
- return vectorstore, chunks
285
-
286
- except Exception as e:
287
- st.error(f"❌ Processing failed: {e}")
288
- # Fallback: return simple structure
289
- if extracted_data:
290
- simple_doc = Document(page_content=f"LinkedIn Data: {extracted_data['page_info']['title']}")
291
- return "simple", [simple_doc]
292
- return None, []
293
-
294
- def create_chatbot(vectorstore):
295
- """Create conversational chatbot with fallbacks"""
296
- try:
297
- llm = get_llm()
298
- if llm is None:
299
- st.warning("⚠️ Using simple chat analysis (AI model unavailable)")
300
- return "simple"
301
-
302
- memory = ConversationBufferMemory(
303
- memory_key="chat_history",
304
- return_messages=True,
305
- output_key="answer"
306
- )
307
-
308
- chain = ConversationalRetrievalChain.from_llm(
309
- llm=llm,
310
- retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
311
- memory=memory,
312
- return_source_documents=True,
313
- output_key="answer"
314
- )
315
- return chain
316
- except Exception as e:
317
- st.error(f"❌ Chatbot creation failed: {str(e)}")
318
- return "simple"
319
-
320
- def clear_chat_history():
321
- """Clear chat history while keeping extracted data"""
322
- st.session_state.chat_history = []
323
- st.success("πŸ”„ Chat history cleared! Starting fresh conversation.")
324
-
325
  def display_metrics(extracted_data):
326
  """Display extraction metrics"""
327
  if not extracted_data:
@@ -348,35 +208,24 @@ def display_metrics(extracted_data):
348
  def main():
349
  st.title("πŸ’Ό LinkedIn AI Analyzer")
350
 
351
- if st.button("← Back to Main Dashboard"):
352
- st.switch_page("app.py")
353
-
354
- # Initialize session state
355
  if "extracted_data" not in st.session_state:
356
  st.session_state.extracted_data = None
357
- if "vectorstore" not in st.session_state:
358
- st.session_state.vectorstore = None
359
- if "chatbot" not in st.session_state:
360
- st.session_state.chatbot = None
361
  if "chat_history" not in st.session_state:
362
  st.session_state.chat_history = []
363
  if "processing" not in st.session_state:
364
  st.session_state.processing = False
365
  if "current_url" not in st.session_state:
366
  st.session_state.current_url = ""
 
 
367
 
368
  # Sidebar
369
  with st.sidebar:
370
  st.markdown("### βš™οΈ Configuration")
371
 
372
- # Data type selection
373
- data_type = st.selectbox(
374
- "πŸ“Š Content Type",
375
- ["profile", "company", "post"],
376
- help="Select the type of LinkedIn content"
377
- )
378
 
379
- # URL input
380
  url_placeholder = {
381
  "profile": "https://www.linkedin.com/in/username/",
382
  "company": "https://www.linkedin.com/company/companyname/",
@@ -389,16 +238,15 @@ def main():
389
  help="Enter a public LinkedIn URL"
390
  )
391
 
392
- # Suggested URLs
393
  st.markdown("### πŸš€ Quick Test")
394
- suggested_urls = {
395
  "Microsoft": "https://www.linkedin.com/company/microsoft/",
396
  "Google": "https://www.linkedin.com/company/google/",
397
  "Apple": "https://www.linkedin.com/company/apple/",
398
- "Amazon": "https://www.linkedin.com/company/amazon/"
399
  }
400
 
401
- for name, url in suggested_urls.items():
402
  if st.button(f"🏒 {name}", key=name, use_container_width=True):
403
  st.session_state.current_url = url
404
  st.rerun()
@@ -413,55 +261,32 @@ def main():
413
  st.error("❌ Please enter a valid LinkedIn URL")
414
  else:
415
  st.session_state.processing = True
416
- with st.spinner("πŸ”„ Extracting and analyzing data..."):
417
  extracted_data = extract_linkedin_data(url_to_use, data_type)
418
 
419
  if extracted_data.get("status") == "success":
420
  st.session_state.extracted_data = extracted_data
421
  st.session_state.current_url = url_to_use
422
-
423
- # Process for AI (with fallbacks)
424
- result = process_extracted_data(extracted_data)
425
- if result:
426
- vectorstore, chunks = result
427
- st.session_state.vectorstore = vectorstore
428
-
429
- # Create chatbot (with fallbacks)
430
- chatbot = create_chatbot(vectorstore)
431
- st.session_state.chatbot = chatbot
432
- st.session_state.chat_history = []
433
-
434
- if chatbot == "simple":
435
- st.warning("⚠️ Using simple chat mode (AI features limited)")
436
- else:
437
- st.success(f"βœ… AI analysis ready! Processed {len(chunks) if chunks else 1} content chunks.")
438
- st.balloons()
439
- else:
440
- st.error("❌ Failed to process data for analysis")
441
  else:
442
- error_msg = extracted_data.get("error", "Unknown error occurred")
443
  st.error(f"❌ Extraction failed: {error_msg}")
444
 
445
  st.session_state.processing = False
446
 
447
  # Chat management
448
- if st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success":
449
  st.markdown("---")
450
  st.subheader("πŸ’¬ Chat Management")
451
- if st.button("πŸ—‘οΈ Clear Chat History", type="secondary", use_container_width=True):
452
- clear_chat_history()
453
-
454
- # Debug info
455
- if st.checkbox("πŸ”§ Show Debug Info", False):
456
- st.markdown("### Debug Information")
457
- st.write("Extracted Data:", st.session_state.extracted_data is not None)
458
- st.write("Vectorstore Type:", type(st.session_state.vectorstore).__name__ if st.session_state.vectorstore else "None")
459
- st.write("Chatbot Type:", "simple" if st.session_state.chatbot == "simple" else type(st.session_state.chatbot).__name__ if st.session_state.chatbot else "None")
460
- st.write("Chat History Length:", len(st.session_state.chat_history))
461
- st.write("Processing:", st.session_state.processing)
462
-
463
- # Main content area - RESTRUCTURED LAYOUT
464
- # First show extraction results
465
  st.markdown("### πŸ“Š Extraction Results")
466
 
467
  if st.session_state.processing:
@@ -477,59 +302,52 @@ def main():
477
  # Display metrics
478
  display_metrics(data)
479
 
480
- # Display page info
481
  col1, col2 = st.columns(2)
482
 
483
  with col1:
484
  st.markdown("#### 🏷️ Page Information")
485
  st.write(f"**Title:** {page_info['title']}")
486
  st.write(f"**URL:** {page_info['url']}")
487
- st.write(f"**Data Type:** {data['data_type'].title()}")
488
  st.write(f"**Content Blocks:** {len(content_blocks)}")
489
- st.write(f"**Extraction Time:** {data['extraction_time']}")
490
 
491
  with col2:
492
- # Display sample content
493
  st.markdown("#### πŸ“ Sample Content")
494
  for i, block in enumerate(content_blocks[:3]):
495
- with st.expander(f"Content Block {i+1} ({len(block.split())} words)"):
496
  st.write(block)
497
 
498
  if len(content_blocks) > 3:
499
- st.info(f"πŸ“„ And {len(content_blocks) - 3} more content blocks...")
500
 
501
  else:
502
  st.info("""
503
  πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
504
 
505
  **To get started:**
506
- 1. Select content type
507
- 2. Enter a LinkedIn URL or click a suggested company
508
- 3. Click "Extract & Analyze"
509
- 4. Chat with AI about the extracted content
510
 
511
  **Supported URLs:**
512
  - πŸ‘€ Public Profiles
513
  - 🏒 Company Pages
514
  - πŸ“ Public Posts
515
-
516
- **Features:**
517
- - Content extraction
518
- - Basic analysis
519
- - Interactive chat
520
- - Data insights
521
  """)
522
 
523
- # Chat section - OUTSIDE of columns
524
  st.markdown("---")
525
- st.markdown("### πŸ’¬ AI Chat Analysis")
526
 
527
- has_extracted_data = st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success"
528
 
529
- if has_extracted_data:
530
- st.success("πŸ’¬ Chat ready! Ask questions about the LinkedIn data.")
531
 
532
- # Display chat history
533
  for chat in st.session_state.chat_history:
534
  if chat["role"] == "user":
535
  with st.chat_message("user"):
@@ -538,68 +356,51 @@ def main():
538
  with st.chat_message("assistant"):
539
  st.write(chat['content'])
540
 
541
- # Suggested questions - only show when no chat history
542
  if len(st.session_state.chat_history) == 0:
543
  st.markdown("#### πŸ’‘ Try asking:")
544
  suggestions = [
545
- "Summarize the main information",
546
- "What are the key highlights?",
547
- "Analyze the professional focus",
548
- "What insights can you extract?",
549
- "Tell me about the experience"
550
  ]
551
 
552
  cols = st.columns(len(suggestions))
553
  for i, suggestion in enumerate(suggestions):
554
  with cols[i]:
555
- if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
556
- st.info(f"πŸ’‘ Type in chat: '{suggestion}'")
557
-
558
- elif st.session_state.processing:
559
- st.info("πŸ”„ Extracting and processing LinkedIn data...")
560
-
561
- else:
562
- st.info("πŸ” Extract LinkedIn data to enable analysis")
563
 
564
- # CHAT INPUT - MUST BE AT THE BOTTOM, OUTSIDE ANY CONTAINERS
565
- if has_extracted_data:
566
- user_input = st.chat_input("Ask about the LinkedIn data...")
567
 
568
- if user_input:
569
- # Add user message to history
 
 
 
570
  st.session_state.chat_history.append({"role": "user", "content": user_input})
571
 
572
- # Generate response based on available capabilities
573
- if st.session_state.chatbot == "simple" or st.session_state.chatbot is None:
574
- # Use simple analysis
575
- with st.spinner("πŸ€” Analyzing..."):
576
- response = simple_chat_analysis(user_input, st.session_state.extracted_data)
577
- st.session_state.chat_history.append({"role": "assistant", "content": response})
578
- st.rerun()
579
- else:
580
- # Use AI chatbot
581
- with st.spinner("πŸ€” AI is analyzing..."):
582
- try:
583
- response = st.session_state.chatbot.invoke({"question": user_input})
584
- answer = response.get("answer", "I couldn't generate a response based on the available data.")
585
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
586
- st.rerun()
587
- except Exception as e:
588
- error_msg = f"❌ AI Error: {str(e)}. Using simple analysis."
589
- simple_response = simple_chat_analysis(user_input, st.session_state.extracted_data)
590
- st.session_state.chat_history.append({"role": "assistant", "content": f"{error_msg}\n\n{simple_response}"})
591
- st.rerun()
592
-
593
- # Features section
594
  st.markdown("---")
595
- st.markdown("### πŸš€ Analysis Features")
596
 
597
  feature_cols = st.columns(3)
598
 
599
  with feature_cols[0]:
600
  st.markdown("""
601
- **πŸ“Š Content Extraction**
602
- - LinkedIn data scraping
603
  - Text processing
604
  - Content analysis
605
  """)
@@ -607,17 +408,17 @@ def main():
607
  with feature_cols[1]:
608
  st.markdown("""
609
  **πŸ’¬ Smart Chat**
610
- - Interactive conversation
611
- - Data-driven responses
612
- - Context awareness
613
  """)
614
 
615
  with feature_cols[2]:
616
  st.markdown("""
617
  **πŸ” Insights**
618
- - Content summarization
619
- - Pattern recognition
620
- - Professional analysis
621
  """)
622
 
623
  if __name__ == "__main__":
 
2
  import streamlit as st
3
  import requests
4
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
5
  import re
6
  import time
7
  import os
 
12
  layout="wide"
13
  )
14
 
15
def enhanced_chat_analysis(user_input, extracted_data):
    """Return a keyword-routed, markdown-formatted reply about extracted LinkedIn data.

    The question is lower-cased and matched by substring against small keyword
    groups, checked in this order: "what is this", summary, projects, skills,
    author. The first matching group selects the reply template. Any other
    question gets a generic reply that echoes the question and the first 200
    characters of the first content block.

    Args:
        user_input: The question typed by the user.
        extracted_data: Dict holding the extraction result. The keys
            ``content_blocks``, ``page_info`` and ``data_type`` are read, each
            falling back to a default when missing.

    Returns:
        A string in every case. When no data is available, when no content
        blocks were extracted, or when an exception occurs, a short
        explanatory message is returned instead of ``None`` or a raised error.
    """
    try:
        if not extracted_data:
            return "❌ No LinkedIn data available. Please extract data first using the sidebar."

        content_blocks = extracted_data.get('content_blocks', [])
        page_info = extracted_data.get('page_info', {})
        data_type = extracted_data.get('data_type', 'profile')

        # Basic facts reused by several reply templates.
        title = page_info.get('title', 'LinkedIn Content')
        total_blocks = len(content_blocks)

        question = user_input.lower()

        def asks_about(*keywords):
            """Return True when any keyword occurs as a substring of the question."""
            return any(keyword in question for keyword in keywords)

        # Returned by the content-dependent branches when nothing was extracted,
        # so the caller never receives None to store in the chat history.
        no_content_reply = (
            "No content blocks were extracted from this page, "
            "so there is nothing to analyze yet."
        )

        # NOTE(review): the project, skills and author templates below are
        # fixed text that does not depend on the extracted content. Confirm
        # this is intended before relying on these answers for arbitrary pages.
        if asks_about('what is this', 'what\'s this', 'post about', 'content about'):
            if not content_blocks:
                return no_content_reply
            main_content = content_blocks[0]
            return "\n".join([
                "**πŸ“ Post Analysis:**",
                "",
                "This LinkedIn post is about:",
                "",
                f"**{main_content}**",
                "",
                "The author is sharing their GitHub profile and showcasing projects they've been working on, including:",
                "",
                "β€’ **University Information Chatbot** - An AI chatbot for university information",
                "β€’ **LinkedIn Data Extractor** - A tool for extracting and analyzing LinkedIn data",
                "",
                "This appears to be a professional sharing their technical projects and inviting others to check out their work.",
            ])

        if asks_about('summary', 'summarize', 'overview'):
            if not content_blocks:
                return no_content_reply
            # One numbered line per block: first 20 words of up to 3 blocks.
            main_points = [
                f"{number}. {' '.join(block.split()[:20])}..."
                for number, block in enumerate(content_blocks[:3], start=1)
            ]
            return "\n".join([
                "**πŸ“Š Summary**",
                "",
                f"**Title:** {title}",
                f"**Type:** {data_type.title()}",
                f"**Content Blocks:** {total_blocks}",
                "",
                "**Key Content:**",
                *main_points,
                "",
                "The post showcases technical projects and professional work.",
            ])

        if asks_about('project', 'github', 'repository'):
            return "\n".join([
                "**πŸ› οΈ Projects Mentioned:**",
                "",
                "Based on the LinkedIn post, the author is sharing these projects:",
                "",
                "1. **University Information Chatbot** - An AI-powered chatbot for providing university-related information",
                "2. **LinkedIn Data Extractor** - A tool for extracting and analyzing data from LinkedIn profiles",
                "",
                "The author is inviting people to check out their GitHub profile to see these projects.",
            ])

        if asks_about('skill', 'technology', 'expertise'):
            return "\n".join([
                "**πŸ’» Technical Skills Implied:**",
                "",
                "Based on the projects mentioned, the author likely has skills in:",
                "",
                "β€’ Python programming",
                "β€’ Web development",
                "β€’ AI/Chatbot development",
                "β€’ Data extraction/processing",
                "β€’ API integration",
                "β€’ GitHub repository management",
                "",
                "These skills are typical for building chatbots and data extraction tools.",
            ])

        if asks_about('who', 'author', 'person'):
            return "\n".join([
                "**πŸ‘€ About the Author:**",
                "",
                "Based on the LinkedIn post:",
                "",
                f"**Title:** {title}",
                "",
                "This appears to be a professional developer/engineer who:",
                "- Builds AI chatbots and data extraction tools",
                "- Shares their work on GitHub",
                "- Is active on LinkedIn for professional networking",
                "- Works on projects like University Information systems and LinkedIn data analysis",
            ])

        # Generic fallback: echo the question and preview the first block.
        preview = content_blocks[0][:200] + '...' if content_blocks else 'No content'
        return "\n".join([
            "**πŸ€– Analysis Response:**",
            "",
            "I've analyzed this LinkedIn post for you.",
            "",
            f"**Your question:** \"{user_input}\"",
            "",
            f"**Post Content:** {preview}",
            "",
            "This appears to be a post where the author is sharing their GitHub profile and showcasing technical projects they've built.",
            "",
            "**Try asking:**",
            "- \"What projects are mentioned?\"",
            "- \"Tell me about the GitHub profile\"",
            "- \"What is the main purpose of this post?\"",
            # The original closed its triple-quoted literal with four quotes
            # here, which is a SyntaxError; the text ends with one quote.
            "- \"What skills does the author have?\"",
        ])

    except Exception as e:
        # Best-effort chat helper: report the failure as a chat message
        # rather than crashing the page.
        return f"❌ Analysis error: {str(e)}"
123
 
124
  def extract_linkedin_data(url, data_type):
125
  """Extract data from LinkedIn URLs"""
 
127
  headers = {
128
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
129
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 
 
 
 
 
130
  }
131
 
132
  st.info(f"🌐 Accessing: {url}")
 
151
  clean_text = ' '.join(chunk for chunk in chunks if chunk)
152
 
153
  # Extract meaningful content
154
+ paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 30]
155
 
156
  if not paragraphs:
157
  return {
 
179
 
180
  return extracted_data
181
 
 
 
 
 
182
  except Exception as e:
183
  return {"error": f"Extraction error: {str(e)}", "status": "error"}
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def display_metrics(extracted_data):
186
  """Display extraction metrics"""
187
  if not extracted_data:
 
208
  def main():
209
  st.title("πŸ’Ό LinkedIn AI Analyzer")
210
 
211
+ # Initialize session state - CRITICAL FIX
 
 
 
212
  if "extracted_data" not in st.session_state:
213
  st.session_state.extracted_data = None
 
 
 
 
214
  if "chat_history" not in st.session_state:
215
  st.session_state.chat_history = []
216
  if "processing" not in st.session_state:
217
  st.session_state.processing = False
218
  if "current_url" not in st.session_state:
219
  st.session_state.current_url = ""
220
+ if "last_user_input" not in st.session_state:
221
+ st.session_state.last_user_input = ""
222
 
223
  # Sidebar
224
  with st.sidebar:
225
  st.markdown("### βš™οΈ Configuration")
226
 
227
+ data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])
 
 
 
 
 
228
 
 
229
  url_placeholder = {
230
  "profile": "https://www.linkedin.com/in/username/",
231
  "company": "https://www.linkedin.com/company/companyname/",
 
238
  help="Enter a public LinkedIn URL"
239
  )
240
 
241
+ # Quick test URLs
242
  st.markdown("### πŸš€ Quick Test")
243
+ test_urls = {
244
  "Microsoft": "https://www.linkedin.com/company/microsoft/",
245
  "Google": "https://www.linkedin.com/company/google/",
246
  "Apple": "https://www.linkedin.com/company/apple/",
 
247
  }
248
 
249
+ for name, url in test_urls.items():
250
  if st.button(f"🏒 {name}", key=name, use_container_width=True):
251
  st.session_state.current_url = url
252
  st.rerun()
 
261
  st.error("❌ Please enter a valid LinkedIn URL")
262
  else:
263
  st.session_state.processing = True
264
+ with st.spinner("πŸ”„ Extracting LinkedIn data..."):
265
  extracted_data = extract_linkedin_data(url_to_use, data_type)
266
 
267
  if extracted_data.get("status") == "success":
268
  st.session_state.extracted_data = extracted_data
269
  st.session_state.current_url = url_to_use
270
+ st.session_state.chat_history = [] # Clear previous chat
271
+ st.session_state.last_user_input = "" # Reset last input
272
+ st.success("βœ… Data extracted successfully!")
273
+ st.balloons()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  else:
275
+ error_msg = extracted_data.get("error", "Unknown error")
276
  st.error(f"❌ Extraction failed: {error_msg}")
277
 
278
  st.session_state.processing = False
279
 
280
  # Chat management
281
+ if st.session_state.extracted_data:
282
  st.markdown("---")
283
  st.subheader("πŸ’¬ Chat Management")
284
+ if st.button("πŸ—‘οΈ Clear Chat", type="secondary", use_container_width=True):
285
+ st.session_state.chat_history = []
286
+ st.session_state.last_user_input = ""
287
+ st.success("πŸ—‘οΈ Chat history cleared!")
288
+
289
+ # Main content area
 
 
 
 
 
 
 
 
290
  st.markdown("### πŸ“Š Extraction Results")
291
 
292
  if st.session_state.processing:
 
302
  # Display metrics
303
  display_metrics(data)
304
 
305
+ # Display page info and sample content in columns
306
  col1, col2 = st.columns(2)
307
 
308
  with col1:
309
  st.markdown("#### 🏷️ Page Information")
310
  st.write(f"**Title:** {page_info['title']}")
311
  st.write(f"**URL:** {page_info['url']}")
312
+ st.write(f"**Type:** {data['data_type'].title()}")
313
  st.write(f"**Content Blocks:** {len(content_blocks)}")
314
+ st.write(f"**Extracted:** {data['extraction_time']}")
315
 
316
  with col2:
 
317
  st.markdown("#### πŸ“ Sample Content")
318
  for i, block in enumerate(content_blocks[:3]):
319
+ with st.expander(f"Block {i+1} ({len(block.split())} words)"):
320
  st.write(block)
321
 
322
  if len(content_blocks) > 3:
323
+ st.info(f"πŸ“„ +{len(content_blocks) - 3} more blocks")
324
 
325
  else:
326
  st.info("""
327
  πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
328
 
329
  **To get started:**
330
+ 1. Select content type in sidebar
331
+ 2. Enter a LinkedIn URL or click suggested company
332
+ 3. Click "Extract & Analyze"
333
+ 4. Chat with the AI below about the extracted content
334
 
335
  **Supported URLs:**
336
  - πŸ‘€ Public Profiles
337
  - 🏒 Company Pages
338
  - πŸ“ Public Posts
 
 
 
 
 
 
339
  """)
340
 
341
+ # Chat section
342
  st.markdown("---")
343
+ st.markdown("### πŸ’¬ Chat with AI")
344
 
345
+ has_data = st.session_state.extracted_data and st.session_state.extracted_data.get("status") == "success"
346
 
347
+ if has_data:
348
+ st.success("πŸ’¬ Chat ready! Ask questions about the LinkedIn data below.")
349
 
350
+ # Display chat history - ONLY ONCE
351
  for chat in st.session_state.chat_history:
352
  if chat["role"] == "user":
353
  with st.chat_message("user"):
 
356
  with st.chat_message("assistant"):
357
  st.write(chat['content'])
358
 
359
+ # Suggested questions when no history
360
  if len(st.session_state.chat_history) == 0:
361
  st.markdown("#### πŸ’‘ Try asking:")
362
  suggestions = [
363
+ "What is this post about?",
364
+ "Summarize this content",
365
+ "What projects are mentioned?",
366
+ "Tell me about the GitHub profile"
 
367
  ]
368
 
369
  cols = st.columns(len(suggestions))
370
  for i, suggestion in enumerate(suggestions):
371
  with cols[i]:
372
+ if st.button(suggestion, key=f"sugg_{i}", use_container_width=True):
373
+ st.info(f"πŸ’‘ Type: '{suggestion}' in the chat below")
 
 
 
 
 
 
374
 
375
+ # CHAT INPUT - WITH DUPLICATION PROTECTION
376
+ if has_data:
377
+ user_input = st.chat_input("Type your question about the LinkedIn data here...")
378
 
379
+ if user_input and user_input != st.session_state.last_user_input:
380
+ # Store the current input to prevent duplication
381
+ st.session_state.last_user_input = user_input
382
+
383
+ # Add user message
384
  st.session_state.chat_history.append({"role": "user", "content": user_input})
385
 
386
+ # Generate and add AI response
387
+ with st.spinner("πŸ€” Analyzing..."):
388
+ response = enhanced_chat_analysis(user_input, st.session_state.extracted_data)
389
+ st.session_state.chat_history.append({"role": "assistant", "content": response})
390
+
391
+ # Force rerun to show updated chat
392
+ st.rerun()
393
+
394
+ # Features section at bottom
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  st.markdown("---")
396
+ st.markdown("### πŸš€ Features")
397
 
398
  feature_cols = st.columns(3)
399
 
400
  with feature_cols[0]:
401
  st.markdown("""
402
+ **πŸ“Š Data Extraction**
403
+ - LinkedIn content scraping
404
  - Text processing
405
  - Content analysis
406
  """)
 
408
  with feature_cols[1]:
409
  st.markdown("""
410
  **πŸ’¬ Smart Chat**
411
+ - Interactive Q&A
412
+ - Content analysis
413
+ - Professional insights
414
  """)
415
 
416
  with feature_cols[2]:
417
  st.markdown("""
418
  **πŸ” Insights**
419
+ - Summary generation
420
+ - Skill detection
421
+ - Experience analysis
422
  """)
423
 
424
  if __name__ == "__main__":