Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Community

Refat81 committed on Oct 21, 2025

Commit

0ce219f

verified ·

1 Parent(s): 7dbea31

Update pages/linkedin_extractor.py

Browse files

Files changed (1) hide show

pages/linkedin_extractor.py +64 -226

pages/linkedin_extractor.py CHANGED Viewed

@@ -19,17 +19,6 @@ st.set_page_config(
     layout="wide"
 )
-st.markdown("""
-<style>
-    .stApp { background-color: #0e1117; color: white; }
-    .main-header { background: #0077B5; color: white; padding: 1.5rem; border-radius: 8px; margin-bottom: 1.5rem; text-align: center; }
-    .stButton>button { background-color: #0077b5; color: white; border: none; border-radius: 4px; padding: 8px 16px; width: 100%; }
-    .stTextInput>div>div>input { background-color: #262730; color: white; border: 1px solid #555; }
-    .stSelectbox>div>div>select { background-color: #262730; color: white; }
-    .stTextArea textarea { background-color: #262730; color: white; }
-</style>
-""", unsafe_allow_html=True)
 def get_embeddings():
     try:
         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
@@ -58,89 +47,57 @@ def get_llm():
 def extract_linkedin_data(url, data_type):
     try:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
         }
-        st.info(f"🔗 Accessing: {url}")
         response = requests.get(url, headers=headers, timeout=15)
         if response.status_code != 200:
             return f"❌ Failed to access page (Status: {response.status_code})"
         soup = BeautifulSoup(response.text, 'html.parser')
-        # Remove scripts and styles
         for script in soup(["script", "style"]):
             script.decompose()
-        # Extract text and clean it
         text = soup.get_text()
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
         text = ' '.join(chunk for chunk in chunks if chunk)
-        # Extract meaningful content
         paragraphs = text.split('.')
         meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
         if not meaningful_content:
-            return "❌ No meaningful content found. The page might require login or have restricted access."
-        # Structure the result
-        if data_type == "profile":
-            result = "👤 LINKEDIN PROFILE DATA\n\n"
-        elif data_type == "company":
-            result = "🏢 LINKEDIN COMPANY DATA\n\n"
-        else:
-            result = "📝 LINKEDIN POST DATA\n\n"
-        result += f"🔗 URL: {url}\n"
-        result += f"📊 Type: {data_type.upper()}\n"
-        result += f"⏰ Extracted: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
-        result += "="*60 + "\n\n"
-        # Add extracted content
-        for i, content in enumerate(meaningful_content[:15], 1):
-            result += f"📄 Content Block {i}:\n"
-            result += f"{content}\n"
-            result += "-" * 40 + "\n\n"
-        result += "="*60 + "\n"
-        result += f"✅ Successfully extracted {len(meaningful_content)} content blocks\n"
-        result += f"📝 Total characters: {len(text):,}\n"
         return result
-    except requests.exceptions.Timeout:
-        return "❌ Error: Request timed out. Please try again."
-    except requests.exceptions.ConnectionError:
-        return "❌ Error: Connection failed. Check your internet connection."
     except Exception as e:
         return f"❌ Error: {str(e)}"
 def get_text_chunks(text):
     if not text.strip():
         return []
-    splitter = CharacterTextSplitter(
-        separator="\n",
-        chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
-    )
     return splitter.split_text(text)
 def get_vectorstore(text_chunks):
     if not text_chunks:
         return None
-    try:
-        documents = [Document(page_content=chunk) for chunk in text_chunks]
-        embeddings = get_embeddings()
-        if embeddings is None:
-            return None
-        vectorstore = FAISS.from_documents(documents, embeddings)
-        return vectorstore
-    except Exception as e:
-        st.error(f"❌ Vector store creation failed: {e}")
         return None
 def get_conversation_chain(vectorstore):
     if vectorstore is None:
@@ -150,56 +107,23 @@ def get_conversation_chain(vectorstore):
         if llm is None:
             return None
-        memory = ConversationBufferMemory(
-            memory_key="chat_history",
-            return_messages=True,
-            output_key="answer"
-        )
         chain = ConversationalRetrievalChain.from_llm(
             llm=llm,
             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
             memory=memory,
-            return_source_documents=True,
-            output_key="answer"
         )
         return chain
     except Exception as e:
-        st.error(f"❌ Conversation chain error: {e}")
         return None
-def clear_chat_history():
-    """Clear chat history while keeping extracted data"""
-    if "vectorstore" in st.session_state and st.session_state.vectorstore:
-        st.session_state.chat_history = []
-        st.session_state.conversation = get_conversation_chain(st.session_state.vectorstore)
-        st.success("🔄 Chat history cleared! Starting fresh conversation.")
-    else:
-        st.error("❌ No data available to chat with.")
 def main():
-    st.markdown("""
-    <div class="main-header">
-        <h1>💼 LinkedIn AI Analyzer</h1>
-        <p>Professional Version - Powered by HuggingFace</p>
-    </div>
-    """, unsafe_allow_html=True)
-    if st.button("← Back to Main Dashboard", use_container_width=True):
-        st.switch_page("main_dashboard.py")
-    # Check API key
-    if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
-        st.error("""
-        ❌ HuggingFace API Key not configured!
-        Please add your API key to Hugging Face Space settings:
-        1. Go to your Space Settings
-        2. Click "Repository Secrets"
-        3. Add: `HUGGINGFACEHUB_API_TOKEN = "your_token_here"`
-        4. Restart the Space
-        """)
-        return
     # Initialize session state
     if "conversation" not in st.session_state:
@@ -210,20 +134,10 @@ def main():
         st.session_state.processed = False
     if "extracted_data" not in st.session_state:
         st.session_state.extracted_data = ""
-    if "vectorstore" not in st.session_state:
-        st.session_state.vectorstore = None
-    if "current_url" not in st.session_state:
-        st.session_state.current_url = ""
     # Sidebar
     with st.sidebar:
-        st.success("✅ HuggingFace API Active")
-        data_type = st.selectbox(
-            "📊 Content Type",
-            ["profile", "company", "post"],
-            help="Select the type of LinkedIn content you want to analyze"
-        )
         url_placeholder = {
             "profile": "https://www.linkedin.com/in/username/",
@@ -231,148 +145,72 @@ def main():
             "post": "https://www.linkedin.com/posts/username_postid/"
         }
-        linkedin_url = st.text_input(
-            "🌐 LinkedIn URL",
-            placeholder=url_placeholder[data_type],
-            help="Enter a public LinkedIn URL (profile, company, or post)"
-        )
-        col1, col2 = st.columns(2)
-        with col1:
-            if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
-                if not linkedin_url.strip():
-                    st.warning("⚠️ Please enter a LinkedIn URL")
-                elif not linkedin_url.startswith('https://www.linkedin.com/'):
-                    st.error("❌ Please enter a valid LinkedIn URL")
-                else:
-                    with st.spinner("🔄 Extracting data from LinkedIn..."):
-                        extracted_data = extract_linkedin_data(linkedin_url, data_type)
-                        if extracted_data and not extracted_data.startswith("❌"):
-                            # Process the data
-                            chunks = get_text_chunks(extracted_data)
-                            if chunks:
-                                vectorstore = get_vectorstore(chunks)
-                                conversation = get_conversation_chain(vectorstore)
-                                if conversation:
-                                    st.session_state.conversation = conversation
-                                    st.session_state.vectorstore = vectorstore
-                                    st.session_state.processed = True
-                                    st.session_state.extracted_data = extracted_data
-                                    st.session_state.chat_history = []
-                                    st.session_state.current_url = linkedin_url
-                                    st.success(f"✅ Successfully processed {len(chunks)} content chunks!")
-                                else:
-                                    st.error("❌ Failed to initialize AI conversation")
                             else:
-                                st.error("❌ No meaningful content could be extracted")
                         else:
-                            st.error(extracted_data)
-        with col2:
-            if st.session_state.processed:
-                if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
-                    clear_chat_history()
-        # Display extraction info
-        if st.session_state.processed:
-            st.markdown("---")
-            st.subheader("📊 Extraction Info")
-            st.write(f"**Type:** {data_type.title()}")
-            st.write(f"**URL:** {st.session_state.current_url[:50]}...")
-            if st.session_state.extracted_data:
-                chunks = get_text_chunks(st.session_state.extracted_data)
-                st.write(f"**Chunks:** {len(chunks)}")
-                st.write(f"**Characters:** {len(st.session_state.extracted_data):,}")
-    # Main content area
     col1, col2 = st.columns([2, 1])
     with col1:
-        st.markdown("### 💬 AI Conversation")
-        # Display chat history
         for i, chat in enumerate(st.session_state.chat_history):
             if chat["role"] == "user":
-                with st.chat_message("user"):
-                    st.write(chat["content"])
             elif chat["role"] == "assistant":
-                with st.chat_message("assistant"):
-                    st.write(chat["content"])
-        # Chat input
         if st.session_state.processed:
             user_input = st.chat_input("Ask about the LinkedIn data...")
             if user_input:
-                # Add user message to chat
                 st.session_state.chat_history.append({"role": "user", "content": user_input})
-                with st.chat_message("user"):
-                    st.write(user_input)
-                # Generate AI response
-                with st.chat_message("assistant"):
-                    with st.spinner("🤔 Analyzing content..."):
-                        try:
-                            if st.session_state.conversation:
-                                response = st.session_state.conversation.invoke({"question": user_input})
-                                answer = response.get("answer", "I couldn't generate a response based on the available data.")
-                                st.write(answer)
-                                st.session_state.chat_history.append({"role": "assistant", "content": answer})
-                            else:
-                                error_msg = "❌ Conversation not initialized. Please extract data first."
-                                st.write(error_msg)
-                                st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
-                        except Exception as e:
-                            error_msg = f"❌ Error generating response: {str(e)}"
-                            st.write(error_msg)
-                            st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
         else:
-            st.info("""
-            👋 **Welcome to LinkedIn AI Analyzer!**
-            **To get started:**
-            1. Select content type in sidebar
-            2. Enter a LinkedIn URL
-            3. Click "Extract & Analyze"
-            4. Chat with the AI about the content
-            **Supported URLs:**
-            - 👤 Profiles: `https://www.linkedin.com/in/username/`
-            - 🏢 Companies: `https://www.linkedin.com/company/companyname/`
-            - 📝 Posts: `https://www.linkedin.com/posts/username_postid/`
-            **Note:** Only public profiles and content are accessible.
-            """)
     with col2:
-        st.markdown("### 📈 Analytics")
         if st.session_state.processed:
             data = st.session_state.extracted_data
             chunks = get_text_chunks(data)
             st.metric("Content Type", data_type.title())
-            st.metric("Content Chunks", len(chunks))
-            st.metric("Total Characters", f"{len(data):,}")
-            st.metric("Conversation Turns", len(st.session_state.chat_history) // 2)
-            st.markdown("### 💡 Suggested Questions")
-            suggestions = [
-                "Summarize the main information",
-                "What are the key skills or experiences?",
-                "Tell me about the company overview",
-                "What's the main content of this post?",
-                "Extract important achievements"
-            ]
-            for suggestion in suggestions:
-                if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
-                    st.info(f"💡 Try asking: '{suggestion}'")
-        else:
-            st.info("📊 Analytics will appear here after data extraction")
 if __name__ == "__main__":
     main()

     layout="wide"
 )
 def get_embeddings():
     try:
         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
 def extract_linkedin_data(url, data_type):
     try:
         headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         }
         response = requests.get(url, headers=headers, timeout=15)
         if response.status_code != 200:
             return f"❌ Failed to access page (Status: {response.status_code})"
         soup = BeautifulSoup(response.text, 'html.parser')
         for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
         text = ' '.join(chunk for chunk in chunks if chunk)
         paragraphs = text.split('.')
         meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
         if not meaningful_content:
+            return "❌ No meaningful content found."
+        result = f"🔗 URL: {url}\n"
+        result += "="*50 + "\n\n"
+        for i, content in enumerate(meaningful_content[:10], 1):
+            result += f"{i}. {content}\n\n"
+        result += "="*50 + "\n"
+        result += f"✅ Extracted {len(meaningful_content)} content blocks\n"
         return result
     except Exception as e:
         return f"❌ Error: {str(e)}"
 def get_text_chunks(text):
     if not text.strip():
         return []
+    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
     return splitter.split_text(text)
 def get_vectorstore(text_chunks):
     if not text_chunks:
         return None
+    documents = [Document(page_content=chunk) for chunk in text_chunks]
+    embeddings = get_embeddings()
+    if embeddings is None:
         return None
+    vectorstore = FAISS.from_documents(documents, embeddings)
+    return vectorstore
 def get_conversation_chain(vectorstore):
     if vectorstore is None:
         if llm is None:
             return None
+        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
         chain = ConversationalRetrievalChain.from_llm(
             llm=llm,
             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
             memory=memory,
+            return_source_documents=True
         )
         return chain
     except Exception as e:
+        st.error(f"❌ Error: {e}")
         return None
 def main():
+    st.title("💼 LinkedIn AI Analyzer")
+    if st.button("← Back to Main Dashboard"):
+        st.switch_page("app.py")
     # Initialize session state
     if "conversation" not in st.session_state:
         st.session_state.processed = False
     if "extracted_data" not in st.session_state:
         st.session_state.extracted_data = ""
     # Sidebar
     with st.sidebar:
+        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
         url_placeholder = {
             "profile": "https://www.linkedin.com/in/username/",
             "post": "https://www.linkedin.com/posts/username_postid/"
         }
+        linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
+        if st.button("🚀 Extract & Analyze", type="primary"):
+            if not linkedin_url.strip():
+                st.warning("Please enter a LinkedIn URL")
+            else:
+                with st.spinner("🔄 Extracting data..."):
+                    extracted_data = extract_linkedin_data(linkedin_url, data_type)
+                    if extracted_data and not extracted_data.startswith("❌"):
+                        chunks = get_text_chunks(extracted_data)
+                        if chunks:
+                            vectorstore = get_vectorstore(chunks)
+                            conversation = get_conversation_chain(vectorstore)
+                            if conversation:
+                                st.session_state.conversation = conversation
+                                st.session_state.processed = True
+                                st.session_state.extracted_data = extracted_data
+                                st.session_state.chat_history = []
+                                st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
                             else:
+                                st.error("❌ Failed to initialize AI")
                         else:
+                            st.error("❌ No content extracted")
+                    else:
+                        st.error(extracted_data)
+    # Main content
     col1, col2 = st.columns([2, 1])
     with col1:
+        st.markdown("### 💬 Chat")
         for i, chat in enumerate(st.session_state.chat_history):
             if chat["role"] == "user":
+                st.markdown(f"**👤 You:** {chat['content']}")
             elif chat["role"] == "assistant":
+                if chat["content"]:
+                    st.markdown(f"**🤖 Assistant:** {chat['content']}")
         if st.session_state.processed:
             user_input = st.chat_input("Ask about the LinkedIn data...")
             if user_input:
                 st.session_state.chat_history.append({"role": "user", "content": user_input})
+                with st.spinner("🤔 Analyzing..."):
+                    try:
+                        if st.session_state.conversation:
+                            response = st.session_state.conversation.invoke({"question": user_input})
+                            answer = response.get("answer", "No response generated.")
+                            st.session_state.chat_history.append({"role": "assistant", "content": answer})
+                            st.rerun()
+                    except Exception as e:
+                        st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
+                        st.rerun()
         else:
+            st.info("👋 Enter a LinkedIn URL and click 'Extract & Analyze' to start")
     with col2:
         if st.session_state.processed:
+            st.markdown("### 📊 Overview")
             data = st.session_state.extracted_data
             chunks = get_text_chunks(data)
             st.metric("Content Type", data_type.title())
+            st.metric("Text Chunks", len(chunks))
+            st.metric("Characters", f"{len(data):,}")
 if __name__ == "__main__":
     main()