Spaces: Runtime error

update app.py with link analyzer

app.py (CHANGED)
@@ -2,6 +2,12 @@ import os
 import tempfile
 import streamlit as st
 import json
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+import time
+from typing import List, Dict, Any
+import pandas as pd
 
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
@@ -18,7 +24,7 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "your-huggingface-api-key
 # --- Initialize Groq LLM ---
 llm = ChatGroq(
     api_key=GROQ_API_KEY,
-    model_name="llama3-8b-8192",
+    model_name="llama3-8b-8192",
     temperature=0.1
 )
 
@@ -26,7 +32,6 @@ llm = ChatGroq(
 embedding = HuggingFaceEmbeddings(
     model_name="sentence-transformers/all-MiniLM-L6-v2",
     cache_folder="./hf_cache",
-    # huggingfacehub_api_token=HUGGINGFACE_API_KEY
 )
 
 # --- System Prompt for Content Enhancement ---
@@ -64,6 +69,140 @@ Present your analysis and optimized text in the following JSON format:
 }
 ```"""
 
+# --- GEO Analysis System Prompt ---
+geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided website content for its effectiveness in AI-powered search engines and LLM systems.
+
+Evaluate the content based on these GEO criteria (score 1-10 each):
+
+1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
+2. **Query Intent Matching**: How well does the content match common user queries?
+3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
+4. **Conversational Readiness**: How suitable is the content for AI chat responses?
+5. **Semantic Richness**: How well does the content use relevant semantic keywords?
+6. **Context Completeness**: Does the content provide complete, self-contained answers?
+7. **Citation Worthiness**: How likely are AI systems to cite this content?
+8. **Multi-Query Coverage**: Does the content answer multiple related questions?
+
+Also identify:
+- Primary topics and entities
+- Missing information gaps
+- Optimization opportunities
+- Specific enhancement recommendations
+
+Format your response as JSON:
+
+```json
+{
+    "geo_scores": {
+        "ai_search_visibility": 7.5,
+        "query_intent_matching": 8.0,
+        "factual_accuracy": 9.0,
+        "conversational_readiness": 6.5,
+        "semantic_richness": 7.0,
+        "context_completeness": 8.5,
+        "citation_worthiness": 7.8,
+        "multi_query_coverage": 6.0
+    },
+    "overall_geo_score": 7.5,
+    "primary_topics": ["topic1", "topic2"],
+    "entities": ["entity1", "entity2"],
+    "missing_gaps": ["gap1", "gap2"],
+    "optimization_opportunities": [
+        {
+            "type": "semantic_enhancement",
+            "description": "Add more related terms",
+            "priority": "high"
+        }
+    ],
+    "recommendations": [
+        "Specific actionable recommendation 1",
+        "Specific actionable recommendation 2"
+    ]
+}
+```"""
+
+# --- Website Scraping Functions ---
+def extract_website_content(url: str, max_pages: int = 5) -> List[Dict[str, Any]]:
+    """Extract content from website pages"""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Remove script and style elements
+        for script in soup(["script", "style", "nav", "footer", "header"]):
+            script.decompose()
+
+        # Extract main content
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
+
+        if main_content:
+            text_content = main_content.get_text(separator=' ', strip=True)
+        else:
+            text_content = soup.get_text(separator=' ', strip=True)
+
+        # Clean up text
+        lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+        cleaned_text = ' '.join(lines)
+
+        # Extract metadata
+        title = soup.find('title').get_text() if soup.find('title') else "No Title"
+        meta_desc = soup.find('meta', attrs={'name': 'description'})
+        description = meta_desc.get('content') if meta_desc else "No Description"
+
+        # Extract headings
+        headings = []
+        for i in range(1, 7):
+            for heading in soup.find_all(f'h{i}'):
+                headings.append({
+                    'level': i,
+                    'text': heading.get_text(strip=True)
+                })
+
+        return [{
+            'url': url,
+            'title': title,
+            'description': description,
+            'content': cleaned_text[:10000],  # Limit content length
+            'headings': headings,
+            'word_count': len(cleaned_text.split())
+        }]
+
+    except Exception as e:
+        st.error(f"Error scraping {url}: {str(e)}")
+        return []
+
+def analyze_page_geo_score(content: str, title: str, llm) -> Dict[str, Any]:
+    """Analyze a single page for GEO score"""
+    try:
+        geo_prompt = ChatPromptTemplate.from_messages([
+            ("system", geo_analysis_prompt),
+            ("user", f"Title: {title}\n\nContent: {content}")
+        ])
+
+        chain = geo_prompt | llm
+        result = chain.invoke({"input": f"Title: {title}\n\nContent: {content}"})
+
+        result_content = result.content if hasattr(result, 'content') else str(result)
+
+        # Extract JSON from response
+        json_start = result_content.find('{')
+        json_end = result_content.rfind('}') + 1
+
+        if json_start != -1 and json_end != -1:
+            json_str = result_content[json_start:json_end]
+            return json.loads(json_str)
+        else:
+            return {"error": "Could not parse GEO analysis"}
+
+    except Exception as e:
+        return {"error": f"Analysis failed: {str(e)}"}
+
 # --- Create Chat Prompt Template for Content Enhancement ---
 enhancement_prompt = ChatPromptTemplate.from_messages([
     ("system", system_prompt),
@@ -71,29 +210,25 @@ enhancement_prompt = ChatPromptTemplate.from_messages([
 ])
 
 # --- Streamlit UI ---
-st.
-st.
-
-
-st.sidebar.
-st.sidebar.markdown("-
+st.set_page_config(page_title="AI Content Optimizer", page_icon="π", layout="wide")
+st.title("π AI Content Optimizer & GEO Analyzer")
+
+# Sidebar
+st.sidebar.title("π οΈ Tools")
+st.sidebar.markdown("- π Document Q&A")
+st.sidebar.markdown("- π§ Content Enhancement")
+st.sidebar.markdown("- π Website GEO Analysis")
+st.sidebar.markdown("- π SEO-like Scoring")
 
-# Create tabs
-tab1, tab2 = st.tabs(["π Document Chat", "π§ Content Enhancement"])
+# Create tabs
+tab1, tab2, tab3 = st.tabs(["π Document Chat", "π§ Content Enhancement", "π Website GEO Analysis"])
 
 with tab1:
     st.header("Document Question Answering")
 
-    # Option to upload PDF
     uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-
-    # Option to paste raw text
     pasted_text = st.text_area("Or paste some text below:", height=150)
-
-    # User's question
     user_query = st.text_input("Ask a question about the content")
-
-    # Submit button for QA
     submit_qa_button = st.button("Submit Question", key="qa_submit")
 
     if submit_qa_button:
@@ -103,7 +238,6 @@ with tab1:
 
         documents = []
 
-        # Handle uploaded PDF
         if uploaded_file:
             with st.spinner("Processing PDF..."):
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
@@ -112,24 +246,18 @@ with tab1:
 
                 loader = PyPDFLoader(tmp_path)
                 documents = loader.load_and_split()
-
-                # Clean up temporary file
                 os.unlink(tmp_path)
 
-        # Handle pasted text if no PDF
         elif pasted_text.strip():
             documents = [Document(page_content=pasted_text)]
-
         else:
             st.warning("Please upload a PDF or paste some text.")
             st.stop()
 
-        # Create vector store
         with st.spinner("Creating embeddings..."):
             vectorstore = FAISS.from_documents(documents, embedding)
             retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
 
-        # Custom prompt for QA
         qa_prompt_template = PromptTemplate(
             input_variables=["context", "question"],
             template="""You are an AI assistant. Use the following context to answer the question.
@@ -140,7 +268,6 @@ with tab1:
 Answer:"""
         )
 
-        # QA Chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
@@ -149,16 +276,12 @@ with tab1:
            chain_type_kwargs={"prompt": qa_prompt_template}
        )
 
-        # Run QA
        with st.spinner("Generating answer..."):
            try:
                result = qa_chain({"query": user_query})
-
-                # Show result
                st.markdown("### π¬ Answer")
                st.write(result["result"])
 
-                # Show sources
                with st.expander("π Source Documents"):
                    for i, doc in enumerate(result["source_documents"]):
                        st.write(f"**Source {i+1}:**")
@@ -172,12 +295,7 @@ with tab1:
 
 with tab2:
     st.header("Content Enhancement Analysis")
-    st.markdown("Analyze and optimize your content for better LLM performance.")
-
-    # Text input for enhancement
     enhancement_text = st.text_area("Enter text to analyze and enhance:", height=200, key="enhancement_input")
-
-    # Submit button for enhancement
     submit_enhancement_button = st.button("Analyze & Enhance", key="enhancement_submit")
 
     if submit_enhancement_button:
@@ -187,20 +305,13 @@ with tab2:
 
        with st.spinner("Analyzing content..."):
            try:
-                # Create the enhancement chain
                enhancement_chain = enhancement_prompt | llm
-
-                # Run enhancement analysis
                result = enhancement_chain.invoke({"input": enhancement_text})
-
-                # Parse the result
                result_content = result.content if hasattr(result, 'content') else str(result)
 
                st.markdown("### π Analysis Results")
 
-                # Try to extract JSON from the response
                try:
-                    # Find JSON in the response
                    json_start = result_content.find('{')
                    json_end = result_content.rfind('}') + 1
 
@@ -208,7 +319,6 @@ with tab2:
                        json_str = result_content[json_start:json_end]
                        analysis_data = json.loads(json_str)
 
-                        # Display scores
                        st.markdown("#### Scores (1-10)")
                        col1, col2, col3 = st.columns(3)
 
@@ -224,50 +334,222 @@ with tab2:
                            answer_score = analysis_data.get('score', {}).get('answerability', 'N/A')
                            st.metric("Answerability", answer_score)
 
-                        # Display keywords
                        keywords = analysis_data.get('keywords', [])
                        if keywords:
                            st.markdown("#### π Key Terms")
                            st.write(", ".join(keywords))
 
-                        # Display optimized text
                        optimized_text = analysis_data.get('optimized_text', '')
                        if optimized_text:
                            st.markdown("#### β¨ Optimized Content")
                            st.text_area("Enhanced version:", value=optimized_text, height=200, key="optimized_output")
-
-                            # Option to copy optimized text
-                            if st.button("π Copy Optimized Text"):
-                                st.success("Text copied to clipboard! (Note: Manual copy from text area above)")
                        else:
-                            # Fallback: display raw response
                            st.markdown("#### Analysis Response")
                            st.write(result_content)
 
                except json.JSONDecodeError:
-                    # Fallback: display raw response
                    st.markdown("#### Analysis Response")
                    st.write(result_content)
 
            except Exception as e:
                st.error(f"An error occurred during enhancement: {str(e)}")
 
+with tab3:
+    st.header("π Website GEO Analysis")
+    st.markdown("Analyze any website for Generative Engine Optimization (GEO) - how well it performs with AI search engines.")
+
+    col1, col2 = st.columns([2, 1])
+
+    with col1:
+        website_url = st.text_input("Enter website URL:", placeholder="https://example.com")
+
+    with col2:
+        max_pages = st.selectbox("Pages to analyze:", [1, 3, 5], index=0)
+
+    analyze_website_button = st.button("π Analyze Website", key="website_analyze")
+
+    if analyze_website_button:
+        if not website_url.strip():
+            st.warning("Please enter a website URL.")
+            st.stop()
+
+        # Add https:// if not present
+        if not website_url.startswith(('http://', 'https://')):
+            website_url = 'https://' + website_url
+
+        with st.spinner(f"Analyzing website: {website_url}"):
+            try:
+                # Extract website content
+                pages_data = extract_website_content(website_url, max_pages)
+
+                if not pages_data:
+                    st.error("Could not extract content from the website.")
+                    st.stop()
+
+                st.success(f"Successfully extracted content from {len(pages_data)} page(s)")
+
+                # Analyze each page
+                all_analyses = []
+
+                for i, page_data in enumerate(pages_data):
+                    with st.spinner(f"Analyzing page {i+1}/{len(pages_data)}..."):
+                        analysis = analyze_page_geo_score(
+                            page_data['content'],
+                            page_data['title'],
+                            llm
+                        )
+
+                    if 'error' not in analysis:
+                        analysis['page_data'] = page_data
+                        all_analyses.append(analysis)
+                    else:
+                        st.warning(f"Could not analyze page {i+1}: {analysis['error']}")
+
+                if all_analyses:
+                    # Display overall results
+                    st.markdown("## π GEO Analysis Results")
+
+                    # Calculate average scores
+                    avg_scores = {}
+                    score_keys = list(all_analyses[0].get('geo_scores', {}).keys())
+
+                    for key in score_keys:
+                        scores = [analysis['geo_scores'][key] for analysis in all_analyses if 'geo_scores' in analysis]
+                        avg_scores[key] = sum(scores) / len(scores) if scores else 0
+
+                    overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
+
+                    # Display metrics
+                    st.markdown("### π― Overall GEO Scores")
+
+                    # Main score
+                    col1, col2, col3 = st.columns([1, 2, 1])
+                    with col2:
+                        st.metric("Overall GEO Score", f"{overall_avg:.1f}/10",
+                                  delta=f"{overall_avg - 7.0:.1f}" if overall_avg >= 7.0 else f"{overall_avg - 7.0:.1f}")
+
+                    # Individual scores
+                    st.markdown("### π Detailed Metrics")
+                    col1, col2, col3, col4 = st.columns(4)
+
+                    metrics_display = [
+                        ("AI Search Visibility", "ai_search_visibility"),
+                        ("Query Intent Match", "query_intent_matching"),
+                        ("Factual Accuracy", "factual_accuracy"),
+                        ("Conversational Ready", "conversational_readiness")
+                    ]
+
+                    for i, (display_name, key) in enumerate(metrics_display):
+                        with [col1, col2, col3, col4][i]:
+                            score = avg_scores.get(key, 0)
+                            st.metric(display_name, f"{score:.1f}")
+
+                    col1, col2, col3, col4 = st.columns(4)
+
+                    metrics_display_2 = [
+                        ("Semantic Richness", "semantic_richness"),
+                        ("Context Complete", "context_completeness"),
+                        ("Citation Worthy", "citation_worthiness"),
+                        ("Multi-Query Cover", "multi_query_coverage")
+                    ]
+
+                    for i, (display_name, key) in enumerate(metrics_display_2):
+                        with [col1, col2, col3, col4][i]:
+                            score = avg_scores.get(key, 0)
+                            st.metric(display_name, f"{score:.1f}")
+
+                    # Recommendations
+                    st.markdown("### π‘ Optimization Recommendations")
+
+                    all_recommendations = []
+                    all_opportunities = []
+
+                    for analysis in all_analyses:
+                        all_recommendations.extend(analysis.get('recommendations', []))
+                        all_opportunities.extend(analysis.get('optimization_opportunities', []))
+
+                    # Remove duplicates
+                    unique_recommendations = list(set(all_recommendations))
+
+                    for i, rec in enumerate(unique_recommendations[:5], 1):
+                        st.write(f"**{i}.** {rec}")
+
+                    # Opportunities by priority
+                    if all_opportunities:
+                        st.markdown("### π Priority Optimizations")
+
+                        high_priority = [opp for opp in all_opportunities if opp.get('priority') == 'high']
+                        medium_priority = [opp for opp in all_opportunities if opp.get('priority') == 'medium']
+
+                        if high_priority:
+                            st.markdown("#### π΄ High Priority")
+                            for opp in high_priority[:3]:
+                                st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
+
+                        if medium_priority:
+                            st.markdown("#### π‘ Medium Priority")
+                            for opp in medium_priority[:3]:
+                                st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
+
+                    # Detailed page analysis
+                    with st.expander("π Detailed Page Analysis"):
+                        for i, analysis in enumerate(all_analyses):
+                            page_data = analysis.get('page_data', {})
+                            st.markdown(f"#### Page {i+1}: {page_data.get('title', 'Unknown Title')}")
+                            st.write(f"**URL**: {page_data.get('url', 'Unknown')}")
+                            st.write(f"**Word Count**: {page_data.get('word_count', 0)}")
+
+                            if 'primary_topics' in analysis:
+                                st.write(f"**Topics**: {', '.join(analysis['primary_topics'])}")
+
+                            if 'entities' in analysis:
+                                st.write(f"**Entities**: {', '.join(analysis['entities'])}")
+
+                            st.write("---")
+
+                    # Export functionality
+                    st.markdown("### π₯ Export Results")
+
+                    if st.button("π Generate Report"):
+                        report_data = {
+                            'website_url': website_url,
+                            'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S'),
+                            'overall_score': overall_avg,
+                            'individual_scores': avg_scores,
+                            'recommendations': unique_recommendations,
+                            'pages_analyzed': len(all_analyses)
+                        }
+
+                        st.json(report_data)
+                        st.success("Report generated! You can copy the JSON above for your records.")
+
+                else:
+                    st.error("Could not analyze any pages from the website.")
+
+            except Exception as e:
+                st.error(f"An error occurred during website analysis: {str(e)}")
+
 # --- Sidebar Information ---
 with st.sidebar:
     st.markdown("---")
     st.markdown("### π§ Configuration")
-    st.markdown("
+    st.markdown("Set your API keys:")
     st.code("export GROQ_API_KEY='your-key'")
-
+
+    st.markdown("---")
+    st.markdown("### π GEO Metrics Explained")
+    st.markdown("**AI Search Visibility**: Likelihood of appearing in AI search results")
+    st.markdown("**Query Intent Matching**: How well content matches user queries")
+    st.markdown("**Conversational Readiness**: Suitability for AI chat responses")
+    st.markdown("**Citation Worthiness**: Probability of being cited by AI")
 
     st.markdown("---")
     st.markdown("### βΉοΈ About")
-    st.markdown("This
-    st.markdown("-
-    st.markdown("-
-    st.markdown("-
-    st.markdown("-
+    st.markdown("This tool analyzes websites for:")
+    st.markdown("- π€ AI search optimization")
+    st.markdown("- π¬ LLM compatibility")
+    st.markdown("- π GEO scoring")
+    st.markdown("- π― Content recommendations")
 
-    # --- Footer ---
     st.markdown("---")
-    st.markdown("
+    st.markdown("*π AI Content Optimizer - Built with Streamlit, LangChain, and Groq*")
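For readers who want to poke at the new scraping step outside Streamlit, here is a minimal standalone sketch of the same fetch-and-clean flow. `fetch_page_text` and the example URL are illustrative stand-ins, not part of the commit; the committed `extract_website_content` additionally returns the title, meta description, headings, and word count.

```python
# Minimal sketch of the scraping step, runnable outside Streamlit.
# fetch_page_text is a hypothetical stand-in for extract_website_content.
import requests
from bs4 import BeautifulSoup

def fetch_page_text(url: str) -> str:
    """Fetch a page and keep only its readable text."""
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    # Drop non-content chrome, mirroring the decompose() loop in the diff
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    main = soup.find("main") or soup.find("article") or soup.body
    return main.get_text(separator=" ", strip=True)[:10000]  # same 10k-character cap as the commit

if __name__ == "__main__":
    print(fetch_page_text("https://example.com")[:200])
```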
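The tab3 dashboard number comes from a two-level mean: each `geo_scores` metric is averaged across the analyzed pages, and those per-metric averages are averaged again for the overall score. A small sketch of that reduction with invented data (the dicts and numbers below are illustrative, standing in for what `analyze_page_geo_score` returns):

```python
# Mock per-page results in the shape the GEO prompt asks the LLM to return.
example_analyses = [
    {"geo_scores": {"ai_search_visibility": 7.0, "citation_worthiness": 8.0}},
    {"geo_scores": {"ai_search_visibility": 6.0, "citation_worthiness": 9.0}},
]

# Same reduction as tab3: per-metric mean across pages, then mean of means.
score_keys = example_analyses[0]["geo_scores"].keys()
avg_scores = {
    key: sum(a["geo_scores"][key] for a in example_analyses) / len(example_analyses)
    for key in score_keys
}
overall_avg = sum(avg_scores.values()) / len(avg_scores)

print(avg_scores)            # {'ai_search_visibility': 6.5, 'citation_worthiness': 8.5}
print(f"{overall_avg:.1f}")  # 7.5
```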