Spaces:

Refat81
/

Social_Media_Data_Extractor_Chatbot

Sleeping

App Files Files Community

Refat81 committed on Oct 21, 2025

Commit

069aef5

verified ·

1 Parent(s): 93a0730

Update pages/linkedin_extractor.py

Browse files

Files changed (1) hide show

pages/linkedin_extractor.py +254 -70

pages/linkedin_extractor.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 from langchain_text_splitters import CharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain_community.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain_core.documents import Document
@@ -20,24 +20,40 @@ st.set_page_config(
 )
 def get_embeddings():
     try:
-        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
         return embeddings
     except Exception as e:
         st.error(f"❌ Failed to load embeddings: {e}")
         return None
 def get_llm():
     try:
         api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
         if not api_key:
-            st.error("❌ HuggingFace API Key not found in environment variables")
             return None
         llm = HuggingFaceHub(
             repo_id="google/flan-t5-large",
             huggingfacehub_api_token=api_key,
-            model_kwargs={"temperature": 0.7, "max_length": 500}
         )
         return llm
     except Exception as e:
@@ -45,86 +61,149 @@ def get_llm():
         return None
 def extract_linkedin_data(url, data_type):
     try:
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
         }
-        response = requests.get(url, headers=headers, timeout=15)
         if response.status_code != 200:
             return f"❌ Failed to access page (Status: {response.status_code})"
         soup = BeautifulSoup(response.text, 'html.parser')
-        for script in soup(["script", "style"]):
             script.decompose()
         text = soup.get_text()
         lines = (line.strip() for line in text.splitlines())
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-        paragraphs = text.split('.')
-        meaningful_content = [p.strip() for p in paragraphs if len(p.strip()) > 50]
-        if not meaningful_content:
-            return "❌ No meaningful content found."
-        result = f"🔗 URL: {url}\n"
-        result += "="*50 + "\n\n"
-        for i, content in enumerate(meaningful_content[:10], 1):
-            result += f"{i}. {content}\n\n"
-        result += "="*50 + "\n"
-        result += f"✅ Extracted {len(meaningful_content)} content blocks\n"
         return result
     except Exception as e:
         return f"❌ Error: {str(e)}"
 def get_text_chunks(text):
     if not text.strip():
         return []
-    splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
     return splitter.split_text(text)
 def get_vectorstore(text_chunks):
     if not text_chunks:
         return None
-    documents = [Document(page_content=chunk) for chunk in text_chunks]
-    embeddings = get_embeddings()
-    if embeddings is None:
         return None
-    vectorstore = FAISS.from_documents(documents, embeddings)
-    return vectorstore
 def get_conversation_chain(vectorstore):
     if vectorstore is None:
         return None
     try:
         llm = get_llm()
         if llm is None:
             return None
-        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
         chain = ConversationalRetrievalChain.from_llm(
             llm=llm,
             retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
             memory=memory,
-            return_source_documents=True
         )
         return chain
     except Exception as e:
-        st.error(f"❌ Error: {e}")
         return None
 def main():
     st.title("💼 LinkedIn AI Analyzer")
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
     # Initialize session state
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
@@ -134,83 +213,188 @@ def main():
         st.session_state.processed = False
     if "extracted_data" not in st.session_state:
         st.session_state.extracted_data = ""
     # Sidebar
     with st.sidebar:
-        data_type = st.selectbox("📊 Content Type", ["profile", "company", "post"])
         url_placeholder = {
             "profile": "https://www.linkedin.com/in/username/",
             "company": "https://www.linkedin.com/company/companyname/",
             "post": "https://www.linkedin.com/posts/username_postid/"
         }
-        linkedin_url = st.text_input("🌐 LinkedIn URL", placeholder=url_placeholder[data_type])
-        if st.button("🚀 Extract & Analyze", type="primary"):
-            if not linkedin_url.strip():
-                st.warning("Please enter a LinkedIn URL")
-            else:
-                with st.spinner("🔄 Extracting data..."):
-                    extracted_data = extract_linkedin_data(linkedin_url, data_type)
-                    if extracted_data and not extracted_data.startswith("❌"):
-                        chunks = get_text_chunks(extracted_data)
-                        if chunks:
-                            vectorstore = get_vectorstore(chunks)
-                            conversation = get_conversation_chain(vectorstore)
-                            if conversation:
-                                st.session_state.conversation = conversation
-                                st.session_state.processed = True
-                                st.session_state.extracted_data = extracted_data
-                                st.session_state.chat_history = []
-                                st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
                             else:
-                                st.error("❌ Failed to initialize AI")
                         else:
-                            st.error("❌ No content extracted")
-                    else:
-                        st.error(extracted_data)
-    # Main content
     col1, col2 = st.columns([2, 1])
     with col1:
-        st.markdown("### 💬 Chat")
         for i, chat in enumerate(st.session_state.chat_history):
             if chat["role"] == "user":
-                st.markdown(f"**👤 You:** {chat['content']}")
             elif chat["role"] == "assistant":
-                if chat["content"]:
-                    st.markdown(f"**🤖 Assistant:** {chat['content']}")
-        if st.session_state.processed:
             user_input = st.chat_input("Ask about the LinkedIn data...")
             if user_input:
                 st.session_state.chat_history.append({"role": "user", "content": user_input})
-                with st.spinner("🤔 Analyzing..."):
-                    try:
-                        if st.session_state.conversation:
                             response = st.session_state.conversation.invoke({"question": user_input})
-                            answer = response.get("answer", "No response generated.")
                             st.session_state.chat_history.append({"role": "assistant", "content": answer})
-                            st.rerun()
-                    except Exception as e:
-                        st.session_state.chat_history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
-                        st.rerun()
         else:
-            st.info("👋 Enter a LinkedIn URL and click 'Extract & Analyze' to start")
     with col2:
         if st.session_state.processed:
-            st.markdown("### 📊 Overview")
             data = st.session_state.extracted_data
             chunks = get_text_chunks(data)
             st.metric("Content Type", data_type.title())
-            st.metric("Text Chunks", len(chunks))
-            st.metric("Characters", f"{len(data):,}")
 if __name__ == "__main__":
     main()

+# pages/linkedin_extractor.py
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 from langchain_text_splitters import CharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain_core.documents import Document
 )
 def get_embeddings():
     """Build the sentence-embedding model used for vector search.

     Returns:
         A ``HuggingFaceEmbeddings`` instance for the
         ``sentence-transformers/all-MiniLM-L6-v2`` model, or ``None`` when
         construction raises (the failure is shown in the Streamlit UI).
     """
     model_id = "sentence-transformers/all-MiniLM-L6-v2"
     try:
         return HuggingFaceEmbeddings(model_name=model_id)
     except Exception as load_error:
         # Surface the problem to the user instead of crashing the page.
         st.error(f"❌ Failed to load embeddings: {load_error}")
         st.info("🔧 Please make sure 'sentence-transformers' is in requirements.txt")
     return None
 def get_llm():
+    """Initialize HuggingFace LLM"""
     # Returns a HuggingFaceHub client for google/flan-t5-large, or None when
     # the API token is missing or construction raises.
     try:
         # The token is read from the environment on every call.
         api_key = os.getenv('HUGGINGFACEHUB_API_TOKEN')
         if not api_key:
             # No token: show setup instructions in the UI and give up.
+            st.error("""
+            ❌ HuggingFace API Key not found!
+            Please add your API key:
+            1. Go to Space Settings → Variables and Secrets
+            2. Add: HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"
+            3. Restart the Space
+            """)
             return None
         llm = HuggingFaceHub(
             repo_id="google/flan-t5-large",
             huggingfacehub_api_token=api_key,
             # Generation settings forwarded to the hosted model.
+            model_kwargs={
+                "temperature": 0.7,
+                "max_length": 512,
+                "max_new_tokens": 256
+            }
         )
         return llm
     # NOTE(review): this listing is a diff view, and the old-side hunk header
     # falls between `except` and `return None`, so one unchanged line is
     # elided here (presumably an error report). As displayed, the exception
     # is swallowed silently - confirm against the full file.
     except Exception as e:
         return None
 def extract_linkedin_data(url, data_type):
     """Download a LinkedIn page and format its visible text as a report.

     Args:
         url: Address of the page to fetch.
         data_type: Content-type label; upper-cased into the report header.

     Returns:
         The formatted report string on success, otherwise a message that
         starts with "❌" and describes what went wrong.
     """
     request_headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
     }
     heavy_rule = "=" * 60
     light_rule = "-" * 40
     try:
         st.info(f"🌐 Accessing: {url}")
         page = requests.get(url, headers=request_headers, timeout=20)
         if page.status_code != 200:
             return f"❌ Failed to access page (Status: {page.status_code})"

         soup = BeautifulSoup(page.text, 'html.parser')
         # Drop tags that never carry visible page text.
         for tag in soup(["script", "style", "meta", "link"]):
             tag.decompose()

         # Flatten the document: strip each line, break it on double spaces,
         # and keep only the non-empty fragments.
         fragments = []
         for raw_line in soup.get_text().splitlines():
             for piece in raw_line.strip().split("  "):
                 piece = piece.strip()
                 if piece:
                     fragments.append(piece)
         clean_text = ' '.join(fragments)

         # A "content block" is any period-delimited span longer than 30 chars.
         paragraphs = []
         for sentence in clean_text.split('.'):
             sentence = sentence.strip()
             if len(sentence) > 30:
                 paragraphs.append(sentence)
         if not paragraphs:
             return "❌ No meaningful content found. The page might require login."

         # Header section of the report.
         report = [
             "🔗 LINKEDIN DATA EXTRACTION\n",
             heavy_rule + "\n\n",
             f"📄 URL: {url}\n",
             f"📊 Type: {data_type.upper()}\n",
             f"⏰ Extracted: {time.strftime('%Y-%m-%d %H:%M:%S')}\n",
             f"📝 Content Blocks: {len(paragraphs)}\n",
             heavy_rule + "\n\n",
         ]
         # Only the first 15 blocks are listed; the totals count all of them.
         for number, block in enumerate(paragraphs[:15], 1):
             report.append(f"📄 Block {number}:\n")
             report.append(f"{block}\n")
             report.append(light_rule + "\n\n")
         report.append(heavy_rule + "\n")
         report.append(f"✅ Successfully extracted {len(paragraphs)} content blocks\n")
         report.append(f"📊 Total characters: {len(clean_text):,}\n")
         return ''.join(report)
     except requests.exceptions.Timeout:
         return "❌ Error: Request timed out. Please try again."
     except requests.exceptions.ConnectionError:
         return "❌ Error: Connection failed. Please check the URL."
     except Exception as unexpected:
         return f"❌ Error: {str(unexpected)}"
 def get_text_chunks(text):
     """Break *text* into overlapping chunks for embedding.

     Returns an empty list when *text* is empty or whitespace-only. Otherwise
     the text is split on newlines by a ``CharacterTextSplitter`` configured
     with ``chunk_size=800`` and ``chunk_overlap=150``.
     """
     if text.strip():
         chunker = CharacterTextSplitter(
             separator="\n",
             chunk_size=800,
             chunk_overlap=150,
             length_function=len,
         )
         return chunker.split_text(text)
     return []
 def get_vectorstore(text_chunks):
     """Index *text_chunks* in a FAISS vector store.

     Returns:
         The FAISS store built from the chunks, or ``None`` when there are no
         chunks, the embedding model is unavailable, or building the store
         raises (that failure is shown in the Streamlit UI).
     """
     if not text_chunks:
         return None
     try:
         docs = []
         for piece in text_chunks:
             docs.append(Document(page_content=piece))
         embedder = get_embeddings()
         if embedder is not None:
             return FAISS.from_documents(docs, embedder)
     except Exception as build_error:
         st.error(f"❌ Vector store creation failed: {build_error}")
     return None
 def get_conversation_chain(vectorstore):
     """Wire the LLM, a retriever and chat memory into one retrieval chain.

     Args:
         vectorstore: Store whose ``as_retriever`` supplies the top-3 matches.

     Returns:
         A ``ConversationalRetrievalChain``, or ``None`` when *vectorstore* is
         ``None``, the LLM is unavailable, or construction raises (that
         failure is shown in the Streamlit UI).
     """
     if vectorstore is None:
         return None
     try:
         llm = get_llm()
         if llm is not None:
             # The chain also returns source documents, so the memory is told
             # which output key holds the reply.
             history = ConversationBufferMemory(
                 memory_key="chat_history",
                 return_messages=True,
                 output_key="answer",
             )
             top_matches = vectorstore.as_retriever(search_kwargs={"k": 3})
             return ConversationalRetrievalChain.from_llm(
                 llm=llm,
                 retriever=top_matches,
                 memory=history,
                 return_source_documents=True,
                 output_key="answer",
             )
     except Exception as chain_error:
         st.error(f"❌ Conversation chain error: {chain_error}")
     return None
+def clear_chat_history():
+    """Clear chat history while keeping extracted data"""
+    if "vectorstore" in st.session_state and st.session_state.vectorstore:
+        st.session_state.chatbot = get_conversation_chain(st.session_state.vectorstore)
+        st.session_state.chat_history = []
+        st.success("🔄 Chat history cleared! Starting fresh conversation.")
 def main():
     # Page entry point: renders the title, the sidebar (content type, URL,
     # extraction trigger, extraction info) and a two-column body (chat on
     # the left, analytics on the right), with state kept in st.session_state.
     st.title("💼 LinkedIn AI Analyzer")
     if st.button("← Back to Main Dashboard"):
         st.switch_page("app.py")
+    # Check API key
+    if not os.getenv('HUGGINGFACEHUB_API_TOKEN'):
+        st.error("""
+        🔑 **HuggingFace API Key Required**
+        To enable AI features:
+        1. Go to **Space Settings** → **Variables and Secrets**
+        2. Add: `HUGGINGFACEHUB_API_TOKEN = "your_hf_token_here"`
+        3. **Restart** the Space
+        Get free API key from: https://huggingface.co/settings/tokens
+        """)
     # Initialize session state
     # NOTE(review): this listing is a diff view; the old-side hunk header
     # shows unchanged lines are elided between the next two assignments
     # (presumably the chat_history / processed guards) - confirm against the
     # full file.
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
         st.session_state.processed = False
     if "extracted_data" not in st.session_state:
         st.session_state.extracted_data = ""
+    if "vectorstore" not in st.session_state:
+        st.session_state.vectorstore = None
+    if "current_url" not in st.session_state:
+        st.session_state.current_url = ""
     # Sidebar
     with st.sidebar:
+        st.markdown("### ⚙️ Configuration")
+        # Data type selection
+        data_type = st.selectbox(
+            "📊 Content Type",
+            ["profile", "company", "post"],
+            help="Select the type of LinkedIn content"
+        )
+        # URL input with examples
         url_placeholder = {
             "profile": "https://www.linkedin.com/in/username/",
             "company": "https://www.linkedin.com/company/companyname/",
             "post": "https://www.linkedin.com/posts/username_postid/"
         }
+        linkedin_url = st.text_input(
+            "🌐 LinkedIn URL",
+            placeholder=url_placeholder[data_type],
+            help="Enter a public LinkedIn URL"
+        )
+        # Suggested URLs
+        st.markdown("### 💡 Try These:")
+        suggested_urls = {
+            "Microsoft": "https://www.linkedin.com/company/microsoft/",
+            "Google": "https://www.linkedin.com/company/google/",
+            "Apple": "https://www.linkedin.com/company/apple/"
+        }
         # A click only stores the URL in session state; it is used on
         # extraction when the text box is empty (see url_to_use below).
+        for name, url in suggested_urls.items():
+            if st.button(f"🏢 {name}", key=name, use_container_width=True):
+                st.session_state.current_url = url
+                st.rerun()
+        # Extract button
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("🚀 Extract & Analyze", type="primary", use_container_width=True):
                 # Typed URL wins; otherwise fall back to the stored one.
+                url_to_use = linkedin_url.strip() or st.session_state.current_url
+                if not url_to_use:
+                    st.warning("⚠️ Please enter a LinkedIn URL")
+                elif not url_to_use.startswith('https://www.linkedin.com/'):
+                    st.error("❌ Please enter a valid LinkedIn URL")
+                else:
+                    with st.spinner("🔄 Extracting data from LinkedIn..."):
+                        extracted_data = extract_linkedin_data(url_to_use, data_type)
                         # extract_linkedin_data reports failures as strings
                         # starting with "❌".
+                        if extracted_data and not extracted_data.startswith("❌"):
+                            # Process for AI
+                            chunks = get_text_chunks(extracted_data)
+                            if chunks:
+                                vectorstore = get_vectorstore(chunks)
+                                conversation = get_conversation_chain(vectorstore)
+                                if conversation:
                                     # Commit state only once the whole
                                     # pipeline has succeeded.
+                                    st.session_state.conversation = conversation
+                                    st.session_state.vectorstore = vectorstore
+                                    st.session_state.processed = True
+                                    st.session_state.extracted_data = extracted_data
+                                    st.session_state.chat_history = []
+                                    st.session_state.current_url = url_to_use
+                                    st.success(f"✅ Ready to analyze {len(chunks)} content chunks!")
+                                else:
+                                    st.error("❌ Failed to initialize AI")
                             else:
+                                st.error("❌ No content extracted")
                         else:
+                            st.error(extracted_data)
+        with col2:
+            if st.session_state.processed:
+                if st.button("🗑️ Clear Chat", type="secondary", use_container_width=True):
+                    clear_chat_history()
+        # Display extraction info
+        if st.session_state.processed:
+            st.markdown("---")
+            st.markdown("### 📊 Extraction Info")
             # NOTE(review): data_type is the selectbox's current value, not
             # necessarily the type used for the last extraction.
+            st.write(f"**Type:** {data_type.title()}")
+            st.write(f"**URL:** {st.session_state.current_url[:50]}...")
+            if st.session_state.extracted_data:
                 # Chunks are recomputed from the stored text on each rerun.
+                chunks = get_text_chunks(st.session_state.extracted_data)
+                st.write(f"**Chunks:** {len(chunks)}")
+                st.write(f"**Characters:** {len(st.session_state.extracted_data):,}")
+    # Main content area
     col1, col2 = st.columns([2, 1])
     with col1:
+        st.markdown("### 💬 AI Conversation")
+        # Display chat history
         for i, chat in enumerate(st.session_state.chat_history):
             if chat["role"] == "user":
+                with st.chat_message("user"):
+                    st.write(chat["content"])
             elif chat["role"] == "assistant":
+                with st.chat_message("assistant"):
+                    st.write(chat["content"])
+        # Chat input
+        if st.session_state.processed and st.session_state.conversation:
             user_input = st.chat_input("Ask about the LinkedIn data...")
             if user_input:
+                # Add user message
                 st.session_state.chat_history.append({"role": "user", "content": user_input})
+                with st.chat_message("user"):
+                    st.write(user_input)
+                # Generate AI response
+                with st.chat_message("assistant"):
+                    with st.spinner("🤔 Analyzing..."):
+                        try:
                             response = st.session_state.conversation.invoke({"question": user_input})
+                            answer = response.get("answer", "I couldn't generate a response based on the available data.")
+                            st.write(answer)
                             st.session_state.chat_history.append({"role": "assistant", "content": answer})
+                        except Exception as e:
                             # Errors are recorded as the assistant's reply
                             # so they stay visible in the transcript.
+                            error_msg = f"❌ Error generating response: {str(e)}"
+                            st.write(error_msg)
+                            st.session_state.chat_history.append({"role": "assistant", "content": error_msg})
         # Reached only when `processed` is truthy but no conversation chain
         # is stored.
+        elif st.session_state.processed:
+            st.info("💬 Extract data first to start chatting with AI")
         else:
+            st.info("""
+            👋 **Welcome to LinkedIn AI Analyzer!**
+            **To get started:**
+            1. Select content type in sidebar
+            2. Enter a LinkedIn URL or click a suggested company
+            3. Click "Extract & Analyze"
+            4. Chat with AI about the extracted content
+            **Supported URLs:**
+            - 👤 Profiles: `https://www.linkedin.com/in/username/`
+            - 🏢 Companies: `https://www.linkedin.com/company/companyname/`
+            - 📝 Posts: `https://www.linkedin.com/posts/username_postid/`
+            **Note:** Only public profiles and content are accessible.
+            """)
     with col2:
+        st.markdown("### 📈 Analytics")
         if st.session_state.processed:
             data = st.session_state.extracted_data
             chunks = get_text_chunks(data)
             st.metric("Content Type", data_type.title())
+            st.metric("Content Chunks", len(chunks))
+            st.metric("Total Characters", f"{len(data):,}")
             # One turn = one user entry plus one assistant entry, hence // 2.
+            st.metric("Conversation Turns", len(st.session_state.chat_history) // 2)
+            # Suggested questions
+            if not st.session_state.chat_history:
+                st.markdown("### 💡 Suggested Questions")
+                suggestions = [
+                    "Summarize the main information",
+                    "What are the key skills or experiences mentioned?",
+                    "Tell me about the company overview",
+                    "What's the main content of this page?",
+                    "Extract important achievements"
+                ]
                 # Clicking a suggestion only shows a hint; it does not
                 # submit the question to the chain.
+                for suggestion in suggestions:
+                    if st.button(suggestion, key=f"suggest_{suggestion}", use_container_width=True):
+                        st.info(f"💡 Try asking: '{suggestion}'")
+        else:
+            st.info("📊 Analytics will appear here after data extraction")
 if __name__ == "__main__":
     main()