Refat81 committed on
Commit
069aef5
·
verified ·
1 Parent(s): 93a0730

Update pages/linkedin_extractor.py

Browse files
Files changed (1) hide show
  1. pages/linkedin_extractor.py +254 -70
pages/linkedin_extractor.py CHANGED
@@ -1,10 +1,10 @@
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
  from langchain_text_splitters import CharacterTextSplitter
5
  from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_community.vectorstores import FAISS
7
- from langchain_community.chat_models import ChatOpenAI
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
10
  from langchain_core.documents import Document
@@ -20,24 +20,40 @@ st.set_page_config(
20
  )
21
 
22
  def get_embeddings():
 
23
  try:
24
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
 
25
  return embeddings
26
  except Exception as e:
27
  st.error(f"❌ Failed to load embeddings: {e}")
 
28
  return None
29
 
30
  def get_llm():
 
31
  try:
32
  api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
33
  if not api_key:
34
- st.error("❌ HuggingFace API Key not found in environment variables")
 
 
 
 
 
 
 
35
  return None
36
 
37
  llm = HuggingFaceHub(
38
  repo_id="google/flan-t5-large",
39
  huggingfacehub_api_token=api_key,
40
- model_kwargs={"temperature": 0.7, "max_length": 500}
 
 
 
 
41
  )
42
  return llm
43
  except Exception as e:
@@ -45,86 +61,149 @@ def get_llm():
45
  return None
46
 
47
  def extract_linkedin_data(url, data_type):
 
48
  try:
49
  headers = {
50
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
51
  }
52
 
53
- response = requests.get(url, headers=headers, timeout=15)
 
 
54
  if response.status_code != 200:
55
  return f"❌ Failed to access page (Status: {response.status_code})"
56
 
57
  soup = BeautifulSoup(response.text, 'html.parser')
58
- for script in soup(["script", "style"]):
 
 
59
  script.decompose()
60
 
 
61
  text = soup.get_text()
62
  lines = (line.strip() for line in text.splitlines())
63
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
64
- text = ' '.join(chunk for chunk in chunks if chunk)
65
 
66
- paragraphs = text.split('.')
67
- meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
68
 
69
- if not meaningful_content:
70
- return "❌ No meaningful content found."
71
 
72
- result = f"πŸ”— URL: {url}\n"
73
- result += "="*50 + "\n\n"
 
 
 
 
 
 
74
 
75
- for i, content in enumerate(meaningful_content[:10], 1):
76
- result += f"{i}. {content}\n\n"
 
 
 
77
 
78
- result += "="*50 + "\n"
79
- result += f"βœ… Extracted {len(meaningful_content)} content blocks\n"
 
80
 
81
  return result
82
 
 
 
 
 
83
  except Exception as e:
84
  return f"❌ Error: {str(e)}"
85
 
86
  def get_text_chunks(text):
 
87
  if not text.strip():
88
  return []
89
- splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
 
 
 
 
 
 
90
  return splitter.split_text(text)
91
 
92
  def get_vectorstore(text_chunks):
 
93
  if not text_chunks:
94
  return None
95
- documents = [Document(page_content=chunk) for chunk in text_chunks]
96
- embeddings = get_embeddings()
97
- if embeddings is None:
 
 
 
 
 
 
 
 
 
98
  return None
99
- vectorstore = FAISS.from_documents(documents, embeddings)
100
- return vectorstore
101
 
102
  def get_conversation_chain(vectorstore):
 
103
  if vectorstore is None:
104
  return None
 
105
  try:
106
  llm = get_llm()
107
  if llm is None:
108
  return None
109
 
110
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
 
 
 
 
 
111
  chain = ConversationalRetrievalChain.from_llm(
112
  llm=llm,
113
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
114
  memory=memory,
115
- return_source_documents=True
 
116
  )
117
  return chain
118
  except Exception as e:
119
- st.error(f"❌ Error: {e}")
120
  return None
121
 
 
 
 
 
 
 
 
122
  def main():
123
  st.title("πŸ’Ό LinkedIn AI Analyzer")
124
 
125
  if st.button("← Back to Main Dashboard"):
126
  st.switch_page("app.py")
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  # Initialize session state
129
  if "conversation" not in st.session_state:
130
  st.session_state.conversation = None
@@ -134,83 +213,188 @@ def main():
134
  st.session_state.processed = False
135
  if "extracted_data" not in st.session_state:
136
  st.session_state.extracted_data = ""
 
 
 
 
137
 
138
  # Sidebar
139
  with st.sidebar:
140
- data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])
141
 
 
 
 
 
 
 
 
 
142
  url_placeholder = {
143
  "profile": "https://www.linkedin.com/in/username/",
144
  "company": "https://www.linkedin.com/company/companyname/",
145
  "post": "https://www.linkedin.com/posts/username_postid/"
146
  }
147
 
148
- linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
149
-
150
- if st.button("πŸš€ Extract & Analyze", type="primary"):
151
- if not linkedin_url.strip():
152
- st.warning("Please enter a LinkedIn URL")
153
- else:
154
- with st.spinner("πŸ”„ Extracting data..."):
155
- extracted_data = extract_linkedin_data(linkedin_url, data_type)
156
-
157
- if extracted_data and not extracted_data.startswith("❌"):
158
- chunks = get_text_chunks(extracted_data)
159
- if chunks:
160
- vectorstore = get_vectorstore(chunks)
161
- conversation = get_conversation_chain(vectorstore)
162
- if conversation:
163
- st.session_state.conversation = conversation
164
- st.session_state.processed = True
165
- st.session_state.extracted_data = extracted_data
166
- st.session_state.chat_history = []
167
- st.success(f"βœ… Ready to analyze {len(chunks)} content chunks!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  else:
169
- st.error("❌ Failed to initialize AI")
170
  else:
171
- st.error("❌ No content extracted")
172
- else:
173
- st.error(extracted_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
- # Main content
176
  col1, col2 = st.columns([2, 1])
177
 
178
  with col1:
179
- st.markdown("### πŸ’¬ Chat")
180
 
 
181
  for i, chat in enumerate(st.session_state.chat_history):
182
  if chat["role"] == "user":
183
- st.markdown(f"**πŸ‘€ You:** {chat['content']}")
 
184
  elif chat["role"] == "assistant":
185
- if chat["content"]:
186
- st.markdown(f"**πŸ€– Assistant:** {chat['content']}")
187
 
188
- if st.session_state.processed:
 
189
  user_input = st.chat_input("Ask about the LinkedIn data...")
 
190
  if user_input:
 
191
  st.session_state.chat_history.append({"role": "user", "content": user_input})
192
- with st.spinner("πŸ€” Analyzing..."):
193
- try:
194
- if st.session_state.conversation:
 
 
 
 
 
195
  response = st.session_state.conversation.invoke({"question": user_input})
196
- answer = response.get("answer", "No response generated.")
 
 
197
  st.session_state.chat_history.append({"role": "assistant", "content": answer})
198
- st.rerun()
199
- except Exception as e:
200
- st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
201
- st.rerun()
 
 
 
202
  else:
203
- st.info("πŸ‘‹ Enter a LinkedIn URL and click 'Extract & Analyze' to start")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  with col2:
 
 
206
  if st.session_state.processed:
207
- st.markdown("### πŸ“Š Overview")
208
  data = st.session_state.extracted_data
209
  chunks = get_text_chunks(data)
210
 
211
  st.metric("Content Type", data_type.title())
212
- st.metric("Text Chunks", len(chunks))
213
- st.metric("Characters", f"{len(data):,}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  if __name__ == "__main__":
216
  main()
 
1
+ # pages/linkedin_extractor.py
2
  import streamlit as st
3
  import requests
4
  from bs4 import BeautifulSoup
5
  from langchain_text_splitters import CharacterTextSplitter
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
 
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
10
  from langchain_core.documents import Document
 
20
  )
21
 
22
  def get_embeddings():
23
+ """Initialize HuggingFace embeddings with fallback"""
24
  try:
25
+ embeddings = HuggingFaceEmbeddings(
26
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
27
+ )
28
  return embeddings
29
  except Exception as e:
30
  st.error(f"❌ Failed to load embeddings: {e}")
31
+ st.info("πŸ”§ Please make sure 'sentence-transformers' is in requirements.txt")
32
  return None
33
 
34
  def get_llm():
35
+ """Initialize HuggingFace LLM"""
36
  try:
37
  api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
38
  if not api_key:
39
+ st.error("""
40
+ ❌ HuggingFace API Key not found!
41
+
42
+ Please add your API key:
43
+ 1. Go to Space Settings β†’ Variables and Secrets
44
+ 2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
45
+ 3. Restart the Space
46
+ """)
47
  return None
48
 
49
  llm = HuggingFaceHub(
50
  repo_id="google/flan-t5-large",
51
  huggingfacehub_api_token=api_key,
52
+ model_kwargs={
53
+ "temperature": 0.7,
54
+ "max_length": 512,
55
+ "max_new_tokens": 256
56
+ }
57
  )
58
  return llm
59
  except Exception as e:
 
61
  return None
62
 
63
  def extract_linkedin_data(url, data_type):
64
+ """Extract data from LinkedIn URLs"""
65
  try:
66
  headers = {
67
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
68
  }
69
 
70
+ st.info(f"🌐 Accessing: {url}")
71
+ response = requests.get(url, headers=headers, timeout=20)
72
+
73
  if response.status_code != 200:
74
  return f"❌ Failed to access page (Status: {response.status_code})"
75
 
76
  soup = BeautifulSoup(response.text, 'html.parser')
77
+
78
+ # Remove scripts and styles
79
+ for script in soup(["script", "style", "meta", "link"]):
80
  script.decompose()
81
 
82
+ # Extract and clean text
83
  text = soup.get_text()
84
  lines = (line.strip() for line in text.splitlines())
85
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
86
+ clean_text = ' '.join(chunk for chunk in chunks if chunk)
87
 
88
+ # Extract meaningful content
89
+ paragraphs = [p.strip() for p in clean_text.split('.') if len(p.strip()) > 30]
90
 
91
+ if not paragraphs:
92
+ return "❌ No meaningful content found. The page might require login."
93
 
94
+ # Structure the result
95
+ result = f"πŸ”— LINKEDIN DATA EXTRACTION\n"
96
+ result += "=" * 60 + "\n\n"
97
+ result += f"πŸ“„ URL: {url}\n"
98
+ result += f"πŸ“Š Type: {data_type.upper()}\n"
99
+ result += f"⏰ Extracted: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
100
+ result += f"πŸ“ Content Blocks: {len(paragraphs)}\n"
101
+ result += "=" * 60 + "\n\n"
102
 
103
+ # Add extracted content
104
+ for i, content in enumerate(paragraphs[:15], 1):
105
+ result += f"πŸ“„ Block {i}:\n"
106
+ result += f"{content}\n"
107
+ result += "-" * 40 + "\n\n"
108
 
109
+ result += "=" * 60 + "\n"
110
+ result += f"βœ… Successfully extracted {len(paragraphs)} content blocks\n"
111
+ result += f"πŸ“Š Total characters: {len(clean_text):,}\n"
112
 
113
  return result
114
 
115
+ except requests.exceptions.Timeout:
116
+ return "❌ Error: Request timed out. Please try again."
117
+ except requests.exceptions.ConnectionError:
118
+ return "❌ Error: Connection failed. Please check the URL."
119
  except Exception as e:
120
  return f"❌ Error: {str(e)}"
121
 
122
  def get_text_chunks(text):
123
+ """Split text into chunks"""
124
  if not text.strip():
125
  return []
126
+
127
+ splitter = CharacterTextSplitter(
128
+ separator="\n",
129
+ chunk_size=800,
130
+ chunk_overlap=150,
131
+ length_function=len
132
+ )
133
  return splitter.split_text(text)
134
 
135
  def get_vectorstore(text_chunks):
136
+ """Create vector store from text chunks"""
137
  if not text_chunks:
138
  return None
139
+
140
+ try:
141
+ documents = [Document(page_content=chunk) for chunk in text_chunks]
142
+ embeddings = get_embeddings()
143
+
144
+ if embeddings is None:
145
+ return None
146
+
147
+ vectorstore = FAISS.from_documents(documents, embeddings)
148
+ return vectorstore
149
+ except Exception as e:
150
+ st.error(f"❌ Vector store creation failed: {e}")
151
  return None
 
 
152
 
153
  def get_conversation_chain(vectorstore):
154
+ """Create conversational chain"""
155
  if vectorstore is None:
156
  return None
157
+
158
  try:
159
  llm = get_llm()
160
  if llm is None:
161
  return None
162
 
163
+ memory = ConversationBufferMemory(
164
+ memory_key="chat_history",
165
+ return_messages=True,
166
+ output_key="answer"
167
+ )
168
+
169
  chain = ConversationalRetrievalChain.from_llm(
170
  llm=llm,
171
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
172
  memory=memory,
173
+ return_source_documents=True,
174
+ output_key="answer"
175
  )
176
  return chain
177
  except Exception as e:
178
+ st.error(f"❌ Conversation chain error: {e}")
179
  return None
180
 
181
+ def clear_chat_history():
182
+ """Clear chat history while keeping extracted data"""
183
+ if "vectorstore" in st.session_state and st.session_state.vectorstore:
184
+ st.session_state.chatbot = get_conversation_chain(st.session_state.vectorstore)
185
+ st.session_state.chat_history = []
186
+ st.success("πŸ”„ Chat history cleared! Starting fresh conversation.")
187
+
188
  def main():
189
  st.title("πŸ’Ό LinkedIn AI Analyzer")
190
 
191
  if st.button("← Back to Main Dashboard"):
192
  st.switch_page("app.py")
193
 
194
+ # Check API key
195
+ if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
196
+ st.error("""
197
+ πŸ”‘ **HuggingFace API Key Required**
198
+
199
+ To enable AI features:
200
+ 1. Go to **Space Settings** β†’ **Variables and Secrets**
201
+ 2. Add: `HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"`
202
+ 3. **Restart** the Space
203
+
204
+ Get free API key from: https://huggingface.co/settings/tokens
205
+ """)
206
+
207
  # Initialize session state
208
  if "conversation" not in st.session_state:
209
  st.session_state.conversation = None
 
213
  st.session_state.processed = False
214
  if "extracted_data" not in st.session_state:
215
  st.session_state.extracted_data = ""
216
+ if "vectorstore" not in st.session_state:
217
+ st.session_state.vectorstore = None
218
+ if "current_url" not in st.session_state:
219
+ st.session_state.current_url = ""
220
 
221
  # Sidebar
222
  with st.sidebar:
223
+ st.markdown("### βš™οΈ Configuration")
224
 
225
+ # Data type selection
226
+ data_type = st.selectbox(
227
+ "πŸ“Š Content Type",
228
+ ["profile", "company", "post"],
229
+ help="Select the type of LinkedIn content"
230
+ )
231
+
232
+ # URL input with examples
233
  url_placeholder = {
234
  "profile": "https://www.linkedin.com/in/username/",
235
  "company": "https://www.linkedin.com/company/companyname/",
236
  "post": "https://www.linkedin.com/posts/username_postid/"
237
  }
238
 
239
+ linkedin_url = st.text_input(
240
+ "🌐 LinkedIn URL",
241
+ placeholder=url_placeholder[data_type],
242
+ help="Enter a public LinkedIn URL"
243
+ )
244
+
245
+ # Suggested URLs
246
+ st.markdown("### πŸ’‘ Try These:")
247
+ suggested_urls = {
248
+ "Microsoft": "https://www.linkedin.com/company/microsoft/",
249
+ "Google": "https://www.linkedin.com/company/google/",
250
+ "Apple": "https://www.linkedin.com/company/apple/"
251
+ }
252
+
253
+ for name, url in suggested_urls.items():
254
+ if st.button(f"🏒 {name}", key=name, use_container_width=True):
255
+ st.session_state.current_url = url
256
+ st.rerun()
257
+
258
+ # Extract button
259
+ col1, col2 = st.columns(2)
260
+ with col1:
261
+ if st.button("πŸš€ Extract & Analyze", type="primary", use_container_width=True):
262
+ url_to_use = linkedin_url.strip() or st.session_state.current_url
263
+
264
+ if not url_to_use:
265
+ st.warning("⚠️ Please enter a LinkedIn URL")
266
+ elif not url_to_use.startswith('https://www.linkedin.com/'):
267
+ st.error("❌ Please enter a valid LinkedIn URL")
268
+ else:
269
+ with st.spinner("πŸ”„ Extracting data from LinkedIn..."):
270
+ extracted_data = extract_linkedin_data(url_to_use, data_type)
271
+
272
+ if extracted_data and not extracted_data.startswith("❌"):
273
+ # Process for AI
274
+ chunks = get_text_chunks(extracted_data)
275
+ if chunks:
276
+ vectorstore = get_vectorstore(chunks)
277
+ conversation = get_conversation_chain(vectorstore)
278
+
279
+ if conversation:
280
+ st.session_state.conversation = conversation
281
+ st.session_state.vectorstore = vectorstore
282
+ st.session_state.processed = True
283
+ st.session_state.extracted_data = extracted_data
284
+ st.session_state.chat_history = []
285
+ st.session_state.current_url = url_to_use
286
+ st.success(f"βœ… Ready to analyze {len(chunks)} content chunks!")
287
+ else:
288
+ st.error("❌ Failed to initialize AI")
289
  else:
290
+ st.error("❌ No content extracted")
291
  else:
292
+ st.error(extracted_data)
293
+
294
+ with col2:
295
+ if st.session_state.processed:
296
+ if st.button("πŸ—‘οΈ Clear Chat", type="secondary", use_container_width=True):
297
+ clear_chat_history()
298
+
299
+ # Display extraction info
300
+ if st.session_state.processed:
301
+ st.markdown("---")
302
+ st.markdown("### πŸ“Š Extraction Info")
303
+ st.write(f"**Type:** {data_type.title()}")
304
+ st.write(f"**URL:** {st.session_state.current_url[:50]}...")
305
+ if st.session_state.extracted_data:
306
+ chunks = get_text_chunks(st.session_state.extracted_data)
307
+ st.write(f"**Chunks:** {len(chunks)}")
308
+ st.write(f"**Characters:** {len(st.session_state.extracted_data):,}")
309
 
310
+ # Main content area
311
  col1, col2 = st.columns([2, 1])
312
 
313
  with col1:
314
+ st.markdown("### πŸ’¬ AI Conversation")
315
 
316
+ # Display chat history
317
  for i, chat in enumerate(st.session_state.chat_history):
318
  if chat["role"] == "user":
319
+ with st.chat_message("user"):
320
+ st.write(chat["content"])
321
  elif chat["role"] == "assistant":
322
+ with st.chat_message("assistant"):
323
+ st.write(chat["content"])
324
 
325
+ # Chat input
326
+ if st.session_state.processed and st.session_state.conversation:
327
  user_input = st.chat_input("Ask about the LinkedIn data...")
328
+
329
  if user_input:
330
+ # Add user message
331
  st.session_state.chat_history.append({"role": "user", "content": user_input})
332
+
333
+ with st.chat_message("user"):
334
+ st.write(user_input)
335
+
336
+ # Generate AI response
337
+ with st.chat_message("assistant"):
338
+ with st.spinner("πŸ€” Analyzing..."):
339
+ try:
340
  response = st.session_state.conversation.invoke({"question": user_input})
341
+ answer = response.get("answer", "I couldn't generate a response based on the available data.")
342
+
343
+ st.write(answer)
344
  st.session_state.chat_history.append({"role": "assistant", "content": answer})
345
+ except Exception as e:
346
+ error_msg = f"❌ Error generating response: {str(e)}"
347
+ st.write(error_msg)
348
+ st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
349
+
350
+ elif st.session_state.processed:
351
+ st.info("πŸ’¬ Extract data first to start chatting with AI")
352
  else:
353
+ st.info("""
354
+ πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
355
+
356
+ **To get started:**
357
+ 1. Select content type in sidebar
358
+ 2. Enter a LinkedIn URL or click a suggested company
359
+ 3. Click "Extract & Analyze"
360
+ 4. Chat with AI about the extracted content
361
+
362
+ **Supported URLs:**
363
+ - πŸ‘€ Profiles: `https://www.linkedin.com/in/username/`
364
+ - 🏒 Companies: `https://www.linkedin.com/company/companyname/`
365
+ - πŸ“ Posts: `https://www.linkedin.com/posts/username_postid/`
366
+
367
+ **Note:** Only public profiles and content are accessible.
368
+ """)
369
 
370
  with col2:
371
+ st.markdown("### πŸ“ˆ Analytics")
372
+
373
  if st.session_state.processed:
 
374
  data = st.session_state.extracted_data
375
  chunks = get_text_chunks(data)
376
 
377
  st.metric("Content Type", data_type.title())
378
+ st.metric("Content Chunks", len(chunks))
379
+ st.metric("Total Characters", f"{len(data):,}")
380
+ st.metric("Conversation Turns", len(st.session_state.chat_history) // 2)
381
+
382
+ # Suggested questions
383
+ if not st.session_state.chat_history:
384
+ st.markdown("### πŸ’‘ Suggested Questions")
385
+ suggestions = [
386
+ "Summarize the main information",
387
+ "What are the key skills or experiences mentioned?",
388
+ "Tell me about the company overview",
389
+ "What's the main content of this page?",
390
+ "Extract important achievements"
391
+ ]
392
+
393
+ for suggestion in suggestions:
394
+ if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
395
+ st.info(f"πŸ’‘ Try asking: '{suggestion}'")
396
+ else:
397
+ st.info("πŸ“Š Analytics will appear here after data extraction")
398
 
399
  if __name__ == "__main__":
400
  main()