Refat81 committed on
Commit
0ce219f
·
verified ·
1 Parent(s): 7dbea31

Update pages/linkedin_extractor.py

Browse files
Files changed (1) hide show
  1. pages/linkedin_extractor.py +64 -226
pages/linkedin_extractor.py CHANGED
@@ -19,17 +19,6 @@ st.set_page_config(
19
  layout="wide"
20
  )
21
 
22
- st.markdown("""
23
- <style>
24
- .stApp { background-color: #0e1117; color: white; }
25
- .main-header { background: #0077B5; color: white; padding: 1.5rem; border-radius: 8px; margin-bottom: 1.5rem; text-align: center; }
26
- .stButton>button { background-color: #0077b5; color: white; border: none; border-radius: 4px; padding: 8px 16px; width: 100%; }
27
- .stTextInput>div>div>input { background-color: #262730; color: white; border: 1px solid #555; }
28
- .stSelectbox>div>div>select { background-color: #262730; color: white; }
29
- .stTextArea textarea { background-color: #262730; color: white; }
30
- </style>
31
- """, unsafe_allow_html=True)
32
-
33
  def get_embeddings():
34
  try:
35
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
@@ -58,89 +47,57 @@ def get_llm():
58
  def extract_linkedin_data(url, data_type):
59
  try:
60
  headers = {
61
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
62
  }
63
 
64
- st.info(f"πŸ”— Accessing: {url}")
65
  response = requests.get(url, headers=headers, timeout=15)
66
  if response.status_code != 200:
67
  return f"❌ Failed to access page (Status: {response.status_code})"
68
 
69
  soup = BeautifulSoup(response.text, 'html.parser')
70
-
71
- # Remove scripts and styles
72
  for script in soup(["script", "style"]):
73
  script.decompose()
74
 
75
- # Extract text and clean it
76
  text = soup.get_text()
77
  lines = (line.strip() for line in text.splitlines())
78
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
79
  text = ' '.join(chunk for chunk in chunks if chunk)
80
 
81
- # Extract meaningful content
82
  paragraphs = text.split('.')
83
  meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
84
 
85
  if not meaningful_content:
86
- return "❌ No meaningful content found. The page might require login or have restricted access."
87
-
88
- # Structure the result
89
- if data_type == "profile":
90
- result = "πŸ‘€ LINKEDIN PROFILE DATA\n\n"
91
- elif data_type == "company":
92
- result = "🏒 LINKEDIN COMPANY DATA\n\n"
93
- else:
94
- result = "πŸ“ LINKEDIN POST DATA\n\n"
95
 
96
- result += f"πŸ”— URL: {url}\n"
97
- result += f"πŸ“Š Type: {data_type.upper()}\n"
98
- result += f"⏰ Extracted: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
99
- result += "="*60 + "\n\n"
100
 
101
- # Add extracted content
102
- for i, content in enumerate(meaningful_content[:15], 1):
103
- result += f"πŸ“„ Content Block {i}:\n"
104
- result += f"{content}\n"
105
- result += "-" * 40 + "\n\n"
106
 
107
- result += "="*60 + "\n"
108
- result += f"βœ… Successfully extracted {len(meaningful_content)} content blocks\n"
109
- result += f"πŸ“ Total characters: {len(text):,}\n"
110
 
111
  return result
112
 
113
- except requests.exceptions.Timeout:
114
- return "❌ Error: Request timed out. Please try again."
115
- except requests.exceptions.ConnectionError:
116
- return "❌ Error: Connection failed. Check your internet connection."
117
  except Exception as e:
118
  return f"❌ Error: {str(e)}"
119
 
120
  def get_text_chunks(text):
121
  if not text.strip():
122
  return []
123
- splitter = CharacterTextSplitter(
124
- separator="\n",
125
- chunk_size=1000,
126
- chunk_overlap=200,
127
- length_function=len
128
- )
129
  return splitter.split_text(text)
130
 
131
  def get_vectorstore(text_chunks):
132
  if not text_chunks:
133
  return None
134
- try:
135
- documents = [Document(page_content=chunk) for chunk in text_chunks]
136
- embeddings = get_embeddings()
137
- if embeddings is None:
138
- return None
139
- vectorstore = FAISS.from_documents(documents, embeddings)
140
- return vectorstore
141
- except Exception as e:
142
- st.error(f"❌ Vector store creation failed: {e}")
143
  return None
 
 
144
 
145
  def get_conversation_chain(vectorstore):
146
  if vectorstore is None:
@@ -150,56 +107,23 @@ def get_conversation_chain(vectorstore):
150
  if llm is None:
151
  return None
152
 
153
- memory = ConversationBufferMemory(
154
- memory_key="chat_history",
155
- return_messages=True,
156
- output_key="answer"
157
- )
158
-
159
  chain = ConversationalRetrievalChain.from_llm(
160
  llm=llm,
161
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
162
  memory=memory,
163
- return_source_documents=True,
164
- output_key="answer"
165
  )
166
  return chain
167
  except Exception as e:
168
- st.error(f"❌ Conversation chain error: {e}")
169
  return None
170
 
171
- def clear_chat_history():
172
- """Clear chat history while keeping extracted data"""
173
- if "vectorstore" in st.session_state and st.session_state.vectorstore:
174
- st.session_state.chat_history = []
175
- st.session_state.conversation = get_conversation_chain(st.session_state.vectorstore)
176
- st.success("πŸ”„ Chat history cleared! Starting fresh conversation.")
177
- else:
178
- st.error("❌ No data available to chat with.")
179
-
180
  def main():
181
- st.markdown("""
182
- <div class="main-header">
183
- <h1>πŸ’Ό LinkedIn AI Analyzer</h1>
184
- <p>Professional Version - Powered by HuggingFace</p>
185
- </div>
186
- """, unsafe_allow_html=True)
187
 
188
- if st.button("← Back to Main Dashboard", use_container_width=True):
189
- st.switch_page("main_dashboard.py")
190
-
191
- # Check API key
192
- if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
193
- st.error("""
194
- ❌ HuggingFace API Key not configured!
195
-
196
- Please add your API key to Hugging Face Space settings:
197
- 1. Go to your Space Settings
198
- 2. Click "Repository Secrets"
199
- 3. Add: `HUGGINGFACEHUB_API_TOKEN = "your_token_here"`
200
- 4. Restart the Space
201
- """)
202
- return
203
 
204
  # Initialize session state
205
  if "conversation" not in st.session_state:
@@ -210,20 +134,10 @@ def main():
210
  st.session_state.processed = False
211
  if "extracted_data" not in st.session_state:
212
  st.session_state.extracted_data = ""
213
- if "vectorstore" not in st.session_state:
214
- st.session_state.vectorstore = None
215
- if "current_url" not in st.session_state:
216
- st.session_state.current_url = ""
217
 
218
  # Sidebar
219
  with st.sidebar:
220
- st.success("βœ… HuggingFace API Active")
221
-
222
- data_type = st.selectbox(
223
- "πŸ“Š Content Type",
224
- ["profile", "company", "post"],
225
- help="Select the type of LinkedIn content you want to analyze"
226
- )
227
 
228
  url_placeholder = {
229
  "profile": "https://www.linkedin.com/in/username/",
@@ -231,148 +145,72 @@ def main():
231
  "post": "https://www.linkedin.com/posts/username_postid/"
232
  }
233
 
234
- linkedin_url = st.text_input(
235
- "🌐 LinkedIn URL",
236
- placeholder=url_placeholder[data_type],
237
- help="Enter a public LinkedIn URL (profile, company, or post)"
238
- )
239
-
240
- col1, col2 = st.columns(2)
241
- with col1:
242
- if st.button("πŸš€ Extract & Analyze", type="primary", use_container_width=True):
243
- if not linkedin_url.strip():
244
- st.warning("⚠️ Please enter a LinkedIn URL")
245
- elif not linkedin_url.startswith('https://www.linkedin.com/'):
246
- st.error("❌ Please enter a valid LinkedIn URL")
247
- else:
248
- with st.spinner("πŸ”„ Extracting data from LinkedIn..."):
249
- extracted_data = extract_linkedin_data(linkedin_url, data_type)
250
-
251
- if extracted_data and not extracted_data.startswith("❌"):
252
- # Process the data
253
- chunks = get_text_chunks(extracted_data)
254
- if chunks:
255
- vectorstore = get_vectorstore(chunks)
256
- conversation = get_conversation_chain(vectorstore)
257
-
258
- if conversation:
259
- st.session_state.conversation = conversation
260
- st.session_state.vectorstore = vectorstore
261
- st.session_state.processed = True
262
- st.session_state.extracted_data = extracted_data
263
- st.session_state.chat_history = []
264
- st.session_state.current_url = linkedin_url
265
- st.success(f"βœ… Successfully processed {len(chunks)} content chunks!")
266
- else:
267
- st.error("❌ Failed to initialize AI conversation")
268
  else:
269
- st.error("❌ No meaningful content could be extracted")
270
  else:
271
- st.error(extracted_data)
272
-
273
- with col2:
274
- if st.session_state.processed:
275
- if st.button("πŸ—‘οΈ Clear Chat", type="secondary", use_container_width=True):
276
- clear_chat_history()
277
-
278
- # Display extraction info
279
- if st.session_state.processed:
280
- st.markdown("---")
281
- st.subheader("πŸ“Š Extraction Info")
282
- st.write(f"**Type:** {data_type.title()}")
283
- st.write(f"**URL:** {st.session_state.current_url[:50]}...")
284
- if st.session_state.extracted_data:
285
- chunks = get_text_chunks(st.session_state.extracted_data)
286
- st.write(f"**Chunks:** {len(chunks)}")
287
- st.write(f"**Characters:** {len(st.session_state.extracted_data):,}")
288
 
289
- # Main content area
290
  col1, col2 = st.columns([2, 1])
291
 
292
  with col1:
293
- st.markdown("### πŸ’¬ AI Conversation")
294
 
295
- # Display chat history
296
  for i, chat in enumerate(st.session_state.chat_history):
297
  if chat["role"] == "user":
298
- with st.chat_message("user"):
299
- st.write(chat["content"])
300
  elif chat["role"] == "assistant":
301
- with st.chat_message("assistant"):
302
- st.write(chat["content"])
303
 
304
- # Chat input
305
  if st.session_state.processed:
306
  user_input = st.chat_input("Ask about the LinkedIn data...")
307
  if user_input:
308
- # Add user message to chat
309
  st.session_state.chat_history.append({"role": "user", "content": user_input})
310
-
311
- with st.chat_message("user"):
312
- st.write(user_input)
313
-
314
- # Generate AI response
315
- with st.chat_message("assistant"):
316
- with st.spinner("πŸ€” Analyzing content..."):
317
- try:
318
- if st.session_state.conversation:
319
- response = st.session_state.conversation.invoke({"question": user_input})
320
- answer = response.get("answer", "I couldn't generate a response based on the available data.")
321
-
322
- st.write(answer)
323
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
324
- else:
325
- error_msg = "❌ Conversation not initialized. Please extract data first."
326
- st.write(error_msg)
327
- st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
328
- except Exception as e:
329
- error_msg = f"❌ Error generating response: {str(e)}"
330
- st.write(error_msg)
331
- st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
332
  else:
333
- st.info("""
334
- πŸ‘‹ **Welcome to LinkedIn AI Analyzer!**
335
-
336
- **To get started:**
337
- 1. Select content type in sidebar
338
- 2. Enter a LinkedIn URL
339
- 3. Click "Extract & Analyze"
340
- 4. Chat with the AI about the content
341
-
342
- **Supported URLs:**
343
- - πŸ‘€ Profiles: `https://www.linkedin.com/in/username/`
344
- - 🏒 Companies: `https://www.linkedin.com/company/companyname/`
345
- - πŸ“ Posts: `https://www.linkedin.com/posts/username_postid/`
346
-
347
- **Note:** Only public profiles and content are accessible.
348
- """)
349
 
350
  with col2:
351
- st.markdown("### πŸ“ˆ Analytics")
352
-
353
  if st.session_state.processed:
 
354
  data = st.session_state.extracted_data
355
  chunks = get_text_chunks(data)
356
 
357
  st.metric("Content Type", data_type.title())
358
- st.metric("Content Chunks", len(chunks))
359
- st.metric("Total Characters", f"{len(data):,}")
360
- st.metric("Conversation Turns", len(st.session_state.chat_history) // 2)
361
-
362
- st.markdown("### πŸ’‘ Suggested Questions")
363
- suggestions = [
364
- "Summarize the main information",
365
- "What are the key skills or experiences?",
366
- "Tell me about the company overview",
367
- "What's the main content of this post?",
368
- "Extract important achievements"
369
- ]
370
-
371
- for suggestion in suggestions:
372
- if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
373
- st.info(f"πŸ’‘ Try asking: '{suggestion}'")
374
- else:
375
- st.info("πŸ“Š Analytics will appear here after data extraction")
376
 
377
  if __name__ == "__main__":
378
  main()
 
19
  layout="wide"
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
22
  def get_embeddings():
23
  try:
24
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 
47
  def extract_linkedin_data(url, data_type):
48
  try:
49
  headers = {
50
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
51
  }
52
 
 
53
  response = requests.get(url, headers=headers, timeout=15)
54
  if response.status_code != 200:
55
  return f"❌ Failed to access page (Status: {response.status_code})"
56
 
57
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
58
  for script in soup(["script", "style"]):
59
  script.decompose()
60
 
 
61
  text = soup.get_text()
62
  lines = (line.strip() for line in text.splitlines())
63
  chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
64
  text = ' '.join(chunk for chunk in chunks if chunk)
65
 
 
66
  paragraphs = text.split('.')
67
  meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
68
 
69
  if not meaningful_content:
70
+ return "❌ No meaningful content found."
 
 
 
 
 
 
 
 
71
 
72
+ result = f"πŸ”— URL: {url}\n"
73
+ result += "="*50 + "\n\n"
 
 
74
 
75
+ for i, content in enumerate(meaningful_content[:10], 1):
76
+ result += f"{i}. {content}\n\n"
 
 
 
77
 
78
+ result += "="*50 + "\n"
79
+ result += f"βœ… Extracted {len(meaningful_content)} content blocks\n"
 
80
 
81
  return result
82
 
 
 
 
 
83
  except Exception as e:
84
  return f"❌ Error: {str(e)}"
85
 
86
  def get_text_chunks(text):
87
  if not text.strip():
88
  return []
89
+ splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
 
 
 
 
 
90
  return splitter.split_text(text)
91
 
92
  def get_vectorstore(text_chunks):
93
  if not text_chunks:
94
  return None
95
+ documents = [Document(page_content=chunk) for chunk in text_chunks]
96
+ embeddings = get_embeddings()
97
+ if embeddings is None:
 
 
 
 
 
 
98
  return None
99
+ vectorstore = FAISS.from_documents(documents, embeddings)
100
+ return vectorstore
101
 
102
  def get_conversation_chain(vectorstore):
103
  if vectorstore is None:
 
107
  if llm is None:
108
  return None
109
 
110
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
 
 
 
 
 
111
  chain = ConversationalRetrievalChain.from_llm(
112
  llm=llm,
113
  retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
114
  memory=memory,
115
+ return_source_documents=True
 
116
  )
117
  return chain
118
  except Exception as e:
119
+ st.error(f"❌ Error: {e}")
120
  return None
121
 
 
 
 
 
 
 
 
 
 
122
  def main():
123
+ st.title("πŸ’Ό LinkedIn AI Analyzer")
 
 
 
 
 
124
 
125
+ if st.button("← Back to Main Dashboard"):
126
+ st.switch_page("app.py")
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
  # Initialize session state
129
  if "conversation" not in st.session_state:
 
134
  st.session_state.processed = False
135
  if "extracted_data" not in st.session_state:
136
  st.session_state.extracted_data = ""
 
 
 
 
137
 
138
  # Sidebar
139
  with st.sidebar:
140
+ data_type = st.selectbox("πŸ“Š Content Type", ["profile", "company", "post"])
 
 
 
 
 
 
141
 
142
  url_placeholder = {
143
  "profile": "https://www.linkedin.com/in/username/",
 
145
  "post": "https://www.linkedin.com/posts/username_postid/"
146
  }
147
 
148
+ linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
149
+
150
+ if st.button("πŸš€ Extract & Analyze", type="primary"):
151
+ if not linkedin_url.strip():
152
+ st.warning("Please enter a LinkedIn URL")
153
+ else:
154
+ with st.spinner("πŸ”„ Extracting data..."):
155
+ extracted_data = extract_linkedin_data(linkedin_url, data_type)
156
+
157
+ if extracted_data and not extracted_data.startswith("❌"):
158
+ chunks = get_text_chunks(extracted_data)
159
+ if chunks:
160
+ vectorstore = get_vectorstore(chunks)
161
+ conversation = get_conversation_chain(vectorstore)
162
+ if conversation:
163
+ st.session_state.conversation = conversation
164
+ st.session_state.processed = True
165
+ st.session_state.extracted_data = extracted_data
166
+ st.session_state.chat_history = []
167
+ st.success(f"βœ… Ready to analyze {len(chunks)} content chunks!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  else:
169
+ st.error("❌ Failed to initialize AI")
170
  else:
171
+ st.error("❌ No content extracted")
172
+ else:
173
+ st.error(extracted_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
+ # Main content
176
  col1, col2 = st.columns([2, 1])
177
 
178
  with col1:
179
+ st.markdown("### πŸ’¬ Chat")
180
 
 
181
  for i, chat in enumerate(st.session_state.chat_history):
182
  if chat["role"] == "user":
183
+ st.markdown(f"**πŸ‘€ You:** {chat['content']}")
 
184
  elif chat["role"] == "assistant":
185
+ if chat["content"]:
186
+ st.markdown(f"**πŸ€– Assistant:** {chat['content']}")
187
 
 
188
  if st.session_state.processed:
189
  user_input = st.chat_input("Ask about the LinkedIn data...")
190
  if user_input:
 
191
  st.session_state.chat_history.append({"role": "user", "content": user_input})
192
+ with st.spinner("πŸ€” Analyzing..."):
193
+ try:
194
+ if st.session_state.conversation:
195
+ response = st.session_state.conversation.invoke({"question": user_input})
196
+ answer = response.get("answer", "No response generated.")
197
+ st.session_state.chat_history.append({"role": "assistant", "content": answer})
198
+ st.rerun()
199
+ except Exception as e:
200
+ st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
201
+ st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
202
  else:
203
+ st.info("πŸ‘‹ Enter a LinkedIn URL and click 'Extract & Analyze' to start")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  with col2:
 
 
206
  if st.session_state.processed:
207
+ st.markdown("### πŸ“Š Overview")
208
  data = st.session_state.extracted_data
209
  chunks = get_text_chunks(data)
210
 
211
  st.metric("Content Type", data_type.title())
212
+ st.metric("Text Chunks", len(chunks))
213
+ st.metric("Characters", f"{len(data):,}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  if __name__ == "__main__":
216
  main()