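"""AI Content Optimizer & GEO Analyzer.

A Streamlit app with three tabs: document Q&A over an uploaded PDF or pasted
text (FAISS + HuggingFace embeddings + a Groq LLM), LLM-based content
enhancement scoring, and a website GEO (Generative Engine Optimization)
analyzer that scrapes a page and scores it for AI-search readiness.
"""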
import os
import tempfile
import json
import time
from typing import List, Dict, Any

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.schema import Document, SystemMessage
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq

# --- Environment Variables ---
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "your-groq-api-key")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "your-huggingface-api-key")
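# NOTE: the fallback values above are placeholders, not working credentials.
# If GROQ_API_KEY is unset, ChatGroq fails with an authentication error on the
# first request. HUGGINGFACE_API_KEY is read but not otherwise used in this
# file: the public embedding model below downloads without authentication.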
# --- Initialize Groq LLM ---
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model_name="llama3-8b-8192",
    temperature=0.1,
)

# --- HuggingFace Embeddings ---
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./hf_cache",
)
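# The sentence-transformers model is fetched from the Hugging Face Hub on the
# first run and cached in ./hf_cache, so later startups reuse the local copy.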
# --- System Prompt for Content Enhancement ---
system_prompt = """You are an AI Content Enhancement Specialist. Your purpose is to optimize user-provided text to maximize its effectiveness for large language models (LLMs) in search, question-answering, and conversational AI systems.

Evaluate the input text based on the following criteria, assigning a score from 1-10 for each:
- Clarity: How easily can the content be understood?
- Structuredness: How well-organized and coherent is the content?
- LLM Answerability: How easily can an LLM extract precise answers from the content?

Identify the most salient keywords.

Rewrite the text to improve:
- Clarity and precision
- Logical structure and flow
- Suitability for LLM-based information retrieval

Present your analysis and optimized text in the following JSON format:
```json
{
  "score": {
    "clarity": 8.5,
    "structuredness": 7.0,
    "answerability": 9.0
  },
  "keywords": ["example", "installation", "setup"],
  "optimized_text": "..."
}
```"""
# --- GEO Analysis System Prompt ---
geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided website content for its effectiveness in AI-powered search engines and LLM systems.

Evaluate the content based on these GEO criteria (score 1-10 each):
1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
2. **Query Intent Matching**: How well does the content match common user queries?
3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
4. **Conversational Readiness**: How suitable is the content for AI chat responses?
5. **Semantic Richness**: How well does the content use relevant semantic keywords?
6. **Context Completeness**: Does the content provide complete, self-contained answers?
7. **Citation Worthiness**: How likely are AI systems to cite this content?
8. **Multi-Query Coverage**: Does the content answer multiple related questions?

Also identify:
- Primary topics and entities
- Missing information gaps
- Optimization opportunities
- Specific enhancement recommendations

Format your response as JSON:
```json
{
  "geo_scores": {
    "ai_search_visibility": 7.5,
    "query_intent_matching": 8.0,
    "factual_accuracy": 9.0,
    "conversational_readiness": 6.5,
    "semantic_richness": 7.0,
    "context_completeness": 8.5,
    "citation_worthiness": 7.8,
    "multi_query_coverage": 6.0
  },
  "overall_geo_score": 7.5,
  "primary_topics": ["topic1", "topic2"],
  "entities": ["entity1", "entity2"],
  "missing_gaps": ["gap1", "gap2"],
  "optimization_opportunities": [
    {
      "type": "semantic_enhancement",
      "description": "Add more related terms",
      "priority": "high"
    }
  ],
  "recommendations": [
    "Specific actionable recommendation 1",
    "Specific actionable recommendation 2"
  ]
}
```"""
# --- Website Scraping Functions ---
def extract_website_content(url: str, max_pages: int = 5) -> List[Dict[str, Any]]:
    """Extract content from website pages.

    Currently fetches only the landing page; max_pages is reserved for
    future multi-page crawling.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script, style, and page-chrome elements that add noise
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract main content, preferring semantic containers
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
        if main_content:
            text_content = main_content.get_text(separator=' ', strip=True)
        else:
            text_content = soup.get_text(separator=' ', strip=True)

        # Collapse whitespace into a single line of text
        lines = [line.strip() for line in text_content.split('\n') if line.strip()]
        cleaned_text = ' '.join(lines)

        # Extract metadata (None-safe for pages missing <title> or description)
        title_tag = soup.find('title')
        title = title_tag.get_text() if title_tag else "No Title"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content', "No Description") if meta_desc else "No Description"

        # Extract headings (h1-h6)
        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    'level': i,
                    'text': heading.get_text(strip=True)
                })

        return [{
            'url': url,
            'title': title,
            'description': description,
            'content': cleaned_text[:10000],  # Limit content length for the LLM
            'headings': headings,
            'word_count': len(cleaned_text.split())
        }]
    except Exception as e:
        st.error(f"Error scraping {url}: {str(e)}")
        return []
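# Illustrative return shape (single landing page):
# [{'url': 'https://example.com', 'title': 'Example', 'description': '...',
#   'content': '...', 'headings': [{'level': 1, 'text': '...'}],
#   'word_count': 1234}]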
def analyze_page_geo_score(content: str, title: str, llm) -> Dict[str, Any]:
    """Analyze a single page for GEO score."""
    try:
        # The system prompt goes in as a SystemMessage so its literal JSON
        # braces are not parsed as template variables; only the user message
        # is templated, and the page text is passed in via {input} rather
        # than baked into the template (scraped braces would break it).
        geo_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content=geo_analysis_prompt),
            ("user", "{input}")
        ])
        chain = geo_prompt | llm
        result = chain.invoke({"input": f"Title: {title}\n\nContent: {content}"})
        result_content = result.content if hasattr(result, 'content') else str(result)

        # Extract JSON by slicing between the outermost braces
        json_start = result_content.find('{')
        json_end = result_content.rfind('}') + 1
        # rfind returns -1 (so json_end == 0) when no closing brace is found
        if json_start != -1 and json_end > json_start:
            json_str = result_content[json_start:json_end]
            return json.loads(json_str)
        else:
            return {"error": "Could not parse GEO analysis"}
    except Exception as e:
        return {"error": f"Analysis failed: {str(e)}"}
# --- Create Chat Prompt Template for Content Enhancement ---
enhancement_prompt = ChatPromptTemplate.from_messages([
    SystemMessage(content=system_prompt),  # literal braces in the JSON example
    ("user", "{input}")
])
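# Usage: (enhancement_prompt | llm).invoke({"input": text}) returns an
# AIMessage whose .content holds the model's reply (see tab2 below).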
# --- Streamlit UI ---
st.set_page_config(page_title="AI Content Optimizer", page_icon="🚀", layout="wide")
st.title("🚀 AI Content Optimizer & GEO Analyzer")

# Sidebar
st.sidebar.title("🛠️ Tools")
st.sidebar.markdown("- 📄 Document Q&A")
st.sidebar.markdown("- 🧠 Content Enhancement")
st.sidebar.markdown("- 🌐 Website GEO Analysis")
st.sidebar.markdown("- 📊 SEO-like Scoring")

# Create tabs
tab1, tab2, tab3 = st.tabs(["📄 Document Chat", "🧠 Content Enhancement", "🌐 Website GEO Analysis"])
with tab1:
    st.header("Document Question Answering")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    pasted_text = st.text_area("Or paste some text below:", height=150)
    user_query = st.text_input("Ask a question about the content")
    submit_qa_button = st.button("Submit Question", key="qa_submit")

    if submit_qa_button:
        if not user_query.strip():
            st.warning("Please enter a question.")
            st.stop()

        documents = []
        if uploaded_file:
            with st.spinner("Processing PDF..."):
                # Write the upload to a temp file so PyPDFLoader can read a path
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                    tmp_file.write(uploaded_file.read())
                    tmp_path = tmp_file.name
                loader = PyPDFLoader(tmp_path)
                documents = loader.load_and_split()
                os.unlink(tmp_path)
        elif pasted_text.strip():
            documents = [Document(page_content=pasted_text)]
        else:
            st.warning("Please upload a PDF or paste some text.")
            st.stop()

        with st.spinner("Creating embeddings..."):
            vectorstore = FAISS.from_documents(documents, embedding)
            retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

        qa_prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""You are an AI assistant. Use the following context to answer the question.
Be concise, accurate, and helpful. If the answer is not in the context, say so.

Context: {context}

Question: {question}

Answer:"""
        )
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": qa_prompt_template}
        )
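        # RetrievalQA's "stuff" chain concatenates the k=3 retrieved chunks
        # into the single {context} slot of the prompt above; this works as
        # long as the combined chunks fit the model's context window.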
| with st.spinner("Generating answer..."): | |
| try: | |
| result = qa_chain({"query": user_query}) | |
| st.markdown("### π¬ Answer") | |
| st.write(result["result"]) | |
| with st.expander("π Source Documents"): | |
| for i, doc in enumerate(result["source_documents"]): | |
| st.write(f"**Source {i+1}:**") | |
| st.write(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content) | |
| if hasattr(doc, 'metadata') and doc.metadata: | |
| st.write(f"*Metadata: {doc.metadata}*") | |
| st.write("---") | |
| except Exception as e: | |
| st.error(f"An error occurred: {str(e)}") | |
with tab2:
    st.header("Content Enhancement Analysis")
    enhancement_text = st.text_area("Enter text to analyze and enhance:", height=200, key="enhancement_input")
    submit_enhancement_button = st.button("Analyze & Enhance", key="enhancement_submit")

    if submit_enhancement_button:
        if not enhancement_text.strip():
            st.warning("Please enter some text to analyze.")
            st.stop()

        with st.spinner("Analyzing content..."):
            try:
                enhancement_chain = enhancement_prompt | llm
                result = enhancement_chain.invoke({"input": enhancement_text})
                result_content = result.content if hasattr(result, 'content') else str(result)

                st.markdown("### 📊 Analysis Results")
                try:
                    json_start = result_content.find('{')
                    json_end = result_content.rfind('}') + 1
                    # rfind returns -1 (so json_end == 0) when nothing is found
                    if json_start != -1 and json_end > json_start:
                        json_str = result_content[json_start:json_end]
                        analysis_data = json.loads(json_str)

                        st.markdown("#### Scores (1-10)")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            clarity_score = analysis_data.get('score', {}).get('clarity', 'N/A')
                            st.metric("Clarity", clarity_score)
                        with col2:
                            struct_score = analysis_data.get('score', {}).get('structuredness', 'N/A')
                            st.metric("Structure", struct_score)
                        with col3:
                            answer_score = analysis_data.get('score', {}).get('answerability', 'N/A')
                            st.metric("Answerability", answer_score)

                        keywords = analysis_data.get('keywords', [])
                        if keywords:
                            st.markdown("#### 🔑 Key Terms")
                            st.write(", ".join(keywords))

                        optimized_text = analysis_data.get('optimized_text', '')
                        if optimized_text:
                            st.markdown("#### ✨ Optimized Content")
                            st.text_area("Enhanced version:", value=optimized_text, height=200, key="optimized_output")
                    else:
                        st.markdown("#### Analysis Response")
                        st.write(result_content)
                except json.JSONDecodeError:
                    st.markdown("#### Analysis Response")
                    st.write(result_content)
            except Exception as e:
                st.error(f"An error occurred during enhancement: {str(e)}")
with tab3:
    st.header("🌐 Website GEO Analysis")
    st.markdown("Analyze any website for Generative Engine Optimization (GEO): how well its content performs with AI-powered search engines.")

    col1, col2 = st.columns([2, 1])
    with col1:
        website_url = st.text_input("Enter website URL:", placeholder="https://example.com")
    with col2:
        max_pages = st.selectbox("Pages to analyze:", [1, 3, 5], index=0)
    analyze_website_button = st.button("🔍 Analyze Website", key="website_analyze")

    if analyze_website_button:
        if not website_url.strip():
            st.warning("Please enter a website URL.")
            st.stop()

        # Default to https:// when no scheme is given
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url
| with st.spinner(f"Analyzing website: {website_url}"): | |
| try: | |
| # Extract website content | |
| pages_data = extract_website_content(website_url, max_pages) | |
| if not pages_data: | |
| st.error("Could not extract content from the website.") | |
| st.stop() | |
| st.success(f"Successfully extracted content from {len(pages_data)} page(s)") | |
| # Analyze each page | |
| all_analyses = [] | |
| for i, page_data in enumerate(pages_data): | |
| with st.spinner(f"Analyzing page {i+1}/{len(pages_data)}..."): | |
| analysis = analyze_page_geo_score( | |
| page_data['content'], | |
| page_data['title'], | |
| llm | |
| ) | |
| if 'error' not in analysis: | |
| analysis['page_data'] = page_data | |
| all_analyses.append(analysis) | |
| else: | |
| st.warning(f"Could not analyze page {i+1}: {analysis['error']}") | |
                if all_analyses:
                    # Display overall results
                    st.markdown("## 📊 GEO Analysis Results")

                    # Average each GEO score across successfully analyzed pages
                    avg_scores = {}
                    score_keys = list(all_analyses[0].get('geo_scores', {}).keys())
                    for key in score_keys:
                        scores = [a['geo_scores'][key] for a in all_analyses
                                  if 'geo_scores' in a and key in a['geo_scores']]
                        avg_scores[key] = sum(scores) / len(scores) if scores else 0
                    overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0

                    # Display metrics
                    st.markdown("### 🎯 Overall GEO Scores")
                    col1, col2, col3 = st.columns([1, 2, 1])
                    with col2:
                        # Delta shows the gap to a 7.0/10 baseline
                        st.metric("Overall GEO Score", f"{overall_avg:.1f}/10",
                                  delta=f"{overall_avg - 7.0:.1f}")
                    # Individual scores
                    st.markdown("### 📊 Detailed Metrics")
                    col1, col2, col3, col4 = st.columns(4)
                    metrics_display = [
                        ("AI Search Visibility", "ai_search_visibility"),
                        ("Query Intent Match", "query_intent_matching"),
                        ("Factual Accuracy", "factual_accuracy"),
                        ("Conversational Ready", "conversational_readiness")
                    ]
                    for i, (display_name, key) in enumerate(metrics_display):
                        with [col1, col2, col3, col4][i]:
                            score = avg_scores.get(key, 0)
                            st.metric(display_name, f"{score:.1f}")

                    col1, col2, col3, col4 = st.columns(4)
                    metrics_display_2 = [
                        ("Semantic Richness", "semantic_richness"),
                        ("Context Complete", "context_completeness"),
                        ("Citation Worthy", "citation_worthiness"),
                        ("Multi-Query Cover", "multi_query_coverage")
                    ]
                    for i, (display_name, key) in enumerate(metrics_display_2):
                        with [col1, col2, col3, col4][i]:
                            score = avg_scores.get(key, 0)
                            st.metric(display_name, f"{score:.1f}")
                    # Recommendations
                    st.markdown("### 💡 Optimization Recommendations")
                    all_recommendations = []
                    all_opportunities = []
                    for analysis in all_analyses:
                        all_recommendations.extend(analysis.get('recommendations', []))
                        all_opportunities.extend(analysis.get('optimization_opportunities', []))

                    # Remove duplicates and show at most five
                    unique_recommendations = list(set(all_recommendations))
                    for i, rec in enumerate(unique_recommendations[:5], 1):
                        st.write(f"**{i}.** {rec}")

                    # Opportunities by priority
                    if all_opportunities:
                        st.markdown("### 📈 Priority Optimizations")
                        high_priority = [opp for opp in all_opportunities if opp.get('priority') == 'high']
                        medium_priority = [opp for opp in all_opportunities if opp.get('priority') == 'medium']
                        if high_priority:
                            st.markdown("#### 🔴 High Priority")
                            for opp in high_priority[:3]:
                                st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
                        if medium_priority:
                            st.markdown("#### 🟡 Medium Priority")
                            for opp in medium_priority[:3]:
                                st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
                    # Detailed page analysis
                    with st.expander("📋 Detailed Page Analysis"):
                        for i, analysis in enumerate(all_analyses):
                            page_data = analysis.get('page_data', {})
                            st.markdown(f"#### Page {i+1}: {page_data.get('title', 'Unknown Title')}")
                            st.write(f"**URL**: {page_data.get('url', 'Unknown')}")
                            st.write(f"**Word Count**: {page_data.get('word_count', 0)}")
                            if 'primary_topics' in analysis:
                                st.write(f"**Topics**: {', '.join(analysis['primary_topics'])}")
                            if 'entities' in analysis:
                                st.write(f"**Entities**: {', '.join(analysis['entities'])}")
                            st.write("---")

                    # Export functionality. A nested st.button here would never
                    # show its results: clicking it reruns the script, the outer
                    # "Analyze Website" branch evaluates False, and this whole
                    # block is skipped. st.download_button serves the report in
                    # a single pass instead.
                    st.markdown("### 📥 Export Results")
                    report_data = {
                        'website_url': website_url,
                        'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S'),
                        'overall_score': overall_avg,
                        'individual_scores': avg_scores,
                        'recommendations': unique_recommendations,
                        'pages_analyzed': len(all_analyses)
                    }
                    st.json(report_data)
                    st.download_button(
                        "📄 Download JSON Report",
                        data=json.dumps(report_data, indent=2),
                        file_name="geo_analysis_report.json",
                        mime="application/json",
                    )
                else:
                    st.error("Could not analyze any pages from the website.")
            except Exception as e:
                st.error(f"An error occurred during website analysis: {str(e)}")
# --- Sidebar Information ---
with st.sidebar:
    st.markdown("---")
    st.markdown("### 🔧 Configuration")
    st.markdown("Set your API keys:")
    st.code("export GROQ_API_KEY='your-key'")
    st.markdown("---")
    st.markdown("### 📊 GEO Metrics Explained")
    st.markdown("**AI Search Visibility**: Likelihood of appearing in AI search results")
    st.markdown("**Query Intent Matching**: How well content matches user queries")
    st.markdown("**Conversational Readiness**: Suitability for AI chat responses")
    st.markdown("**Citation Worthiness**: Probability of being cited by AI")
    st.markdown("---")
    st.markdown("### ℹ️ About")
    st.markdown("This tool analyzes websites for:")
    st.markdown("- 🤖 AI search optimization")
    st.markdown("- 💬 LLM compatibility")
    st.markdown("- 📊 GEO scoring")
    st.markdown("- 🎯 Content recommendations")
    st.markdown("---")
    st.markdown("*🚀 AI Content Optimizer - Built with Streamlit, LangChain, and Groq*")