import os
import tempfile
import json
import time
import requests
import streamlit as st
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Any, Optional

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.schema import Document
from langchain_groq import ChatGroq
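
# Approximate runtime requirements (assumed; the source does not pin them):
#   pip install streamlit requests beautifulsoup4 pypdf faiss-cpu \
#       sentence-transformers langchain langchain-community langchain-groq
# Launch with:  streamlit run <this_file>.py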

GROQ_API_KEY = os.getenv("GROQ_API_KEY", "your-groq-api-key")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "your-huggingface-api-key")

llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model_name="llama3-8b-8192",
    temperature=0.1
)
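# "llama3-8b-8192" is a Groq-hosted model id; substitute any currently
# available Groq chat model if it has been retired. The low temperature keeps
# the JSON-formatted analyses below comparatively deterministic.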

embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="./hf_cache",
)
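# Note: HuggingFaceEmbeddings downloads and runs the sentence-transformers
# model locally (cached in ./hf_cache), so HUGGINGFACE_API_KEY is not actually
# required for the embedding step.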
system_prompt = """You are an AI Content Enhancement Specialist. Your purpose is to optimize user-provided text to maximize its effectiveness for large language models (LLMs) in search, question-answering, and conversational AI systems. |
|
|
|
|
|
Evaluate the input text based on the following criteria, assigning a score from 1β10 for each: |
|
|
|
|
|
Clarity: How easily can the content be understood? |
|
|
|
|
|
Structuredness: How well-organized and coherent is the content? |
|
|
|
|
|
LLM Answerability: How easily can an LLM extract precise answers from the content? |
|
|
|
|
|
Identify the most salient keywords. |
|
|
|
|
|
Rewrite the text to improve: |
|
|
|
|
|
Clarity and precision |
|
|
|
|
|
Logical structure and flow |
|
|
|
|
|
Suitability for LLM-based information retrieval |
|
|
|
|
|
Present your analysis and optimized text in the following JSON format: |
|
|
|
|
|
```json |
|
|
{ |
|
|
"score": { |
|
|
"clarity": 8.5, |
|
|
"structuredness": 7.0, |
|
|
"answerability": 9.0 |
|
|
}, |
|
|
"keywords": ["example", "installation", "setup"], |
|
|
"optimized_text": "..." |
|
|
} |
|
|
```""" |
|
|
|
|
|
|
|
|

# Braces in the JSON example are doubled here as well, for the same reason.
geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided website content for its effectiveness in AI-powered search engines and LLM systems.

Evaluate the content based on these GEO criteria (score 1-10 each):

1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
2. **Query Intent Matching**: How well does the content match common user queries?
3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
4. **Conversational Readiness**: How suitable is the content for AI chat responses?
5. **Semantic Richness**: How well does the content use relevant semantic keywords?
6. **Context Completeness**: Does the content provide complete, self-contained answers?
7. **Citation Worthiness**: How likely are AI systems to cite this content?
8. **Multi-Query Coverage**: Does the content answer multiple related questions?

Also identify:
- Primary topics and entities
- Missing information gaps
- Optimization opportunities
- Specific enhancement recommendations

Format your response as JSON:

```json
{{
  "geo_scores": {{
    "ai_search_visibility": 7.5,
    "query_intent_matching": 8.0,
    "factual_accuracy": 9.0,
    "conversational_readiness": 6.5,
    "semantic_richness": 7.0,
    "context_completeness": 8.5,
    "citation_worthiness": 7.8,
    "multi_query_coverage": 6.0
  }},
  "overall_geo_score": 7.5,
  "primary_topics": ["topic1", "topic2"],
  "entities": ["entity1", "entity2"],
  "missing_gaps": ["gap1", "gap2"],
  "optimization_opportunities": [
    {{
      "type": "semantic_enhancement",
      "description": "Add more related terms",
      "priority": "high"
    }}
  ],
  "recommendations": [
    "Specific actionable recommendation 1",
    "Specific actionable recommendation 2"
  ]
}}
```"""
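
# Minimal helper sketch: both the GEO analysis and the enhancement tab need to
# pull a JSON object out of a raw LLM completion. rfind('}') returns -1 on a
# miss, so after adding 1 a check of the form `json_end != -1` can never fail;
# comparing the two offsets is what actually detects the no-JSON case.
def extract_first_json(text: str) -> Optional[Dict[str, Any]]:
    """Return the outermost JSON object embedded in `text`, or None."""
    json_start = text.find('{')
    json_end = text.rfind('}') + 1
    if json_start == -1 or json_end <= json_start:
        return None
    try:
        return json.loads(text[json_start:json_end])
    except json.JSONDecodeError:
        return None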

def extract_website_content(url: str, max_pages: int = 5) -> List[Dict[str, Any]]:
    """Extract content from up to `max_pages` pages, following same-domain links."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    base_domain = urlparse(url).netloc
    queue = [url]
    visited = set()
    pages = []

    while queue and len(pages) < max_pages:
        page_url = queue.pop(0)
        if page_url in visited:
            continue
        visited.add(page_url)

        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            response.raise_for_status()
        except Exception as e:
            st.error(f"Error scraping {page_url}: {str(e)}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Queue same-domain links before stripping the navigation elements.
        for link in soup.find_all('a', href=True):
            next_url = urljoin(page_url, link['href']).split('#')[0]
            if urlparse(next_url).netloc == base_domain and next_url not in visited:
                queue.append(next_url)

        # Remove non-content elements.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        # Prefer an obvious main-content container when one exists.
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
        text_content = (main_content or soup).get_text(separator=' ', strip=True)

        # Collapse runs of whitespace into single spaces.
        cleaned_text = ' '.join(text_content.split())

        title = soup.find('title').get_text() if soup.find('title') else "No Title"
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        description = meta_desc.get('content') if meta_desc else "No Description"

        headings = []
        for i in range(1, 7):
            for heading in soup.find_all(f'h{i}'):
                headings.append({
                    'level': i,
                    'text': heading.get_text(strip=True)
                })

        pages.append({
            'url': page_url,
            'title': title,
            'description': description,
            'content': cleaned_text[:10000],
            'headings': headings,
            'word_count': len(cleaned_text.split())
        })

    return pages
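
# Hypothetical usage of the crawler above:
#   pages = extract_website_content("https://example.com", max_pages=3)
# -> a list of dicts with 'url', 'title', 'description', 'content', 'headings',
#    and 'word_count' keys, one per fetched page (breadth-first, same-domain,
#    capped at max_pages).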

def analyze_page_geo_score(content: str, title: str, llm) -> Dict[str, Any]:
    """Analyze a single page for GEO score"""
    try:
        # Pass the page text through the {input} variable instead of baking it
        # into the template: scraped content can itself contain braces, which
        # would otherwise be parsed as template variables.
        geo_prompt = ChatPromptTemplate.from_messages([
            ("system", geo_analysis_prompt),
            ("user", "{input}")
        ])

        chain = geo_prompt | llm
        result = chain.invoke({"input": f"Title: {title}\n\nContent: {content}"})

        result_content = result.content if hasattr(result, 'content') else str(result)

        parsed = extract_first_json(result_content)
        if parsed is not None:
            return parsed
        return {"error": "Could not parse GEO analysis"}

    except Exception as e:
        return {"error": f"Analysis failed: {str(e)}"}

enhancement_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}")
])
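# `enhancement_prompt | llm` composes a LangChain runnable: invoking it with
# {"input": ...} formats the chat messages and calls the Groq model in one step.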

st.set_page_config(page_title="AI Content Optimizer", page_icon="🚀", layout="wide")
st.title("🚀 AI Content Optimizer & GEO Analyzer")

st.sidebar.title("🛠️ Tools")
st.sidebar.markdown("- 📄 Document Q&A")
st.sidebar.markdown("- 🧠 Content Enhancement")
st.sidebar.markdown("- 🌐 Website GEO Analysis")
st.sidebar.markdown("- 📊 SEO-like Scoring")

tab1, tab2, tab3 = st.tabs(["📄 Document Chat", "🧠 Content Enhancement", "🌐 Website GEO Analysis"])

with tab1:
    st.header("Document Question Answering")

    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    pasted_text = st.text_area("Or paste some text below:", height=150)
    user_query = st.text_input("Ask a question about the content")
    submit_qa_button = st.button("Submit Question", key="qa_submit")

    if submit_qa_button:
        if not user_query.strip():
            st.warning("Please enter a question.")
            st.stop()

        documents = []

        if uploaded_file:
            with st.spinner("Processing PDF..."):
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                    tmp_file.write(uploaded_file.read())
                    tmp_path = tmp_file.name

                loader = PyPDFLoader(tmp_path)
                documents = loader.load_and_split()
                os.unlink(tmp_path)

        elif pasted_text.strip():
            documents = [Document(page_content=pasted_text)]
        else:
            st.warning("Please upload a PDF or paste some text.")
            st.stop()

        with st.spinner("Creating embeddings..."):
            vectorstore = FAISS.from_documents(documents, embedding)
            retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

        qa_prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""You are an AI assistant. Use the following context to answer the question.
Be concise, accurate, and helpful. If the answer is not in the context, say so.

Context: {context}
Question: {question}
Answer:"""
        )

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": qa_prompt_template}
        )

        with st.spinner("Generating answer..."):
            try:
                # .invoke() replaces the deprecated direct-call interface.
                result = qa_chain.invoke({"query": user_query})
                st.markdown("### 💬 Answer")
                st.write(result["result"])

                with st.expander("📚 Source Documents"):
                    for i, doc in enumerate(result["source_documents"]):
                        st.write(f"**Source {i+1}:**")
                        st.write(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content)
                        if hasattr(doc, 'metadata') and doc.metadata:
                            st.write(f"*Metadata: {doc.metadata}*")
                        st.write("---")

            except Exception as e:
                st.error(f"An error occurred: {str(e)}")

with tab2:
    st.header("Content Enhancement Analysis")
    enhancement_text = st.text_area("Enter text to analyze and enhance:", height=200, key="enhancement_input")
    submit_enhancement_button = st.button("Analyze & Enhance", key="enhancement_submit")

    if submit_enhancement_button:
        if not enhancement_text.strip():
            st.warning("Please enter some text to analyze.")
            st.stop()

        with st.spinner("Analyzing content..."):
            try:
                enhancement_chain = enhancement_prompt | llm
                result = enhancement_chain.invoke({"input": enhancement_text})
                result_content = result.content if hasattr(result, 'content') else str(result)

                st.markdown("### 📊 Analysis Results")

                analysis_data = extract_first_json(result_content)

                if analysis_data is not None:
                    st.markdown("#### Scores (1-10)")
                    col1, col2, col3 = st.columns(3)

                    with col1:
                        clarity_score = analysis_data.get('score', {}).get('clarity', 'N/A')
                        st.metric("Clarity", clarity_score)

                    with col2:
                        struct_score = analysis_data.get('score', {}).get('structuredness', 'N/A')
                        st.metric("Structure", struct_score)

                    with col3:
                        answer_score = analysis_data.get('score', {}).get('answerability', 'N/A')
                        st.metric("Answerability", answer_score)

                    keywords = analysis_data.get('keywords', [])
                    if keywords:
                        st.markdown("#### 🔑 Key Terms")
                        st.write(", ".join(keywords))

                    optimized_text = analysis_data.get('optimized_text', '')
                    if optimized_text:
                        st.markdown("#### ✨ Optimized Content")
                        st.text_area("Enhanced version:", value=optimized_text, height=200, key="optimized_output")
                else:
                    # Fall back to the raw model output when no JSON could be parsed.
                    st.markdown("#### Analysis Response")
                    st.write(result_content)

            except Exception as e:
                st.error(f"An error occurred during enhancement: {str(e)}")

with tab3:
    st.header("🌐 Website GEO Analysis")
    st.markdown("Analyze any website for Generative Engine Optimization (GEO) - how well it performs with AI search engines.")

    col1, col2 = st.columns([2, 1])

    with col1:
        website_url = st.text_input("Enter website URL:", placeholder="https://example.com")

    with col2:
        max_pages = st.selectbox("Pages to analyze:", [1, 3, 5], index=0)

    analyze_website_button = st.button("🔍 Analyze Website", key="website_analyze")

    if analyze_website_button:
        if not website_url.strip():
            st.warning("Please enter a website URL.")
            st.stop()

        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url
with st.spinner(f"Analyzing website: {website_url}"): |
|
|
try: |
|
|
|
|
|
pages_data = extract_website_content(website_url, max_pages) |
|
|
|
|
|
if not pages_data: |
|
|
st.error("Could not extract content from the website.") |
|
|
st.stop() |
|
|
|
|
|
st.success(f"Successfully extracted content from {len(pages_data)} page(s)") |
|
|
|
|
|
|
|
|
all_analyses = [] |
|
|
|
|
|
for i, page_data in enumerate(pages_data): |
|
|
with st.spinner(f"Analyzing page {i+1}/{len(pages_data)}..."): |
|
|
analysis = analyze_page_geo_score( |
|
|
page_data['content'], |
|
|
page_data['title'], |
|
|
llm |
|
|
) |
|
|
|
|
|
if 'error' not in analysis: |
|
|
analysis['page_data'] = page_data |
|
|
all_analyses.append(analysis) |
|
|
else: |
|
|
st.warning(f"Could not analyze page {i+1}: {analysis['error']}") |
|
|
|
|
|
if all_analyses: |
|
|
|
|
|
st.markdown("## π GEO Analysis Results") |
|
|
|
|
|
|
|
|
avg_scores = {} |
|
|
score_keys = list(all_analyses[0].get('geo_scores', {}).keys()) |
|
|
|
|
|
for key in score_keys: |
|
|
scores = [analysis['geo_scores'][key] for analysis in all_analyses if 'geo_scores' in analysis] |
|
|
avg_scores[key] = sum(scores) / len(scores) if scores else 0 |
|
|
|
|
|
overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0 |
|
|
|
|
|
|
|
|
st.markdown("### π― Overall GEO Scores") |
|
|
|
|
|
|
|
|
col1, col2, col3 = st.columns([1, 2, 1]) |
|
|
with col2: |
|
|
st.metric("Overall GEO Score", f"{overall_avg:.1f}/10", |
|
|
delta=f"{overall_avg - 7.0:.1f}" if overall_avg >= 7.0 else f"{overall_avg - 7.0:.1f}") |
|
|
|
|
|
|
|
|
st.markdown("### π Detailed Metrics") |
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
|
|
|
metrics_display = [ |
|
|
("AI Search Visibility", "ai_search_visibility"), |
|
|
("Query Intent Match", "query_intent_matching"), |
|
|
("Factual Accuracy", "factual_accuracy"), |
|
|
("Conversational Ready", "conversational_readiness") |
|
|
] |
|
|
|
|
|
for i, (display_name, key) in enumerate(metrics_display): |
|
|
with [col1, col2, col3, col4][i]: |
|
|
score = avg_scores.get(key, 0) |
|
|
st.metric(display_name, f"{score:.1f}") |
|
|
|
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
|
|
|
metrics_display_2 = [ |
|
|
("Semantic Richness", "semantic_richness"), |
|
|
("Context Complete", "context_completeness"), |
|
|
("Citation Worthy", "citation_worthiness"), |
|
|
("Multi-Query Cover", "multi_query_coverage") |
|
|
] |
|
|
|
|
|
for i, (display_name, key) in enumerate(metrics_display_2): |
|
|
with [col1, col2, col3, col4][i]: |
|
|
score = avg_scores.get(key, 0) |
|
|
st.metric(display_name, f"{score:.1f}") |
|
|
|
|
|
|
|
|
st.markdown("### π‘ Optimization Recommendations") |
|
|
|
|
|
all_recommendations = [] |
|
|
all_opportunities = [] |
|
|
|
|
|
for analysis in all_analyses: |
|
|
all_recommendations.extend(analysis.get('recommendations', [])) |
|
|
all_opportunities.extend(analysis.get('optimization_opportunities', [])) |
|
|
|
|
|
|
|
|
unique_recommendations = list(set(all_recommendations)) |
|
|
|
|
|
for i, rec in enumerate(unique_recommendations[:5], 1): |
|
|
st.write(f"**{i}.** {rec}") |
|
|
|
|
|
|
|
|
if all_opportunities: |
|
|
st.markdown("### π Priority Optimizations") |
|
|
|
|
|
high_priority = [opp for opp in all_opportunities if opp.get('priority') == 'high'] |
|
|
medium_priority = [opp for opp in all_opportunities if opp.get('priority') == 'medium'] |
|
|
|
|
|
if high_priority: |
|
|
st.markdown("#### π΄ High Priority") |
|
|
for opp in high_priority[:3]: |
|
|
st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}") |
|
|
|
|
|
if medium_priority: |
|
|
st.markdown("#### π‘ Medium Priority") |
|
|
for opp in medium_priority[:3]: |
|
|
st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}") |
|
|
|
|
|
|
|
|
with st.expander("π Detailed Page Analysis"): |
|
|
for i, analysis in enumerate(all_analyses): |
|
|
page_data = analysis.get('page_data', {}) |
|
|
st.markdown(f"#### Page {i+1}: {page_data.get('title', 'Unknown Title')}") |
|
|
st.write(f"**URL**: {page_data.get('url', 'Unknown')}") |
|
|
st.write(f"**Word Count**: {page_data.get('word_count', 0)}") |
|
|
|
|
|
if 'primary_topics' in analysis: |
|
|
st.write(f"**Topics**: {', '.join(analysis['primary_topics'])}") |
|
|
|
|
|
if 'entities' in analysis: |
|
|
st.write(f"**Entities**: {', '.join(analysis['entities'])}") |
|
|
|
|
|
st.write("---") |
|
|
|
|
|
|
|
|
st.markdown("### π₯ Export Results") |
|
|
|
|
|
if st.button("π Generate Report"): |
|
|
report_data = { |
|
|
'website_url': website_url, |
|
|
'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S'), |
|
|
'overall_score': overall_avg, |
|
|
'individual_scores': avg_scores, |
|
|
'recommendations': unique_recommendations, |
|
|
'pages_analyzed': len(all_analyses) |
|
|
} |
|
|
|
|
|
st.json(report_data) |
|
|
st.success("Report generated! You can copy the JSON above for your records.") |
|
|
|
|
|
else: |
|
|
st.error("Could not analyze any pages from the website.") |
|
|
|
|
|
except Exception as e: |
|
|
st.error(f"An error occurred during website analysis: {str(e)}") |
|
|
|
|
|
|
|
|

with st.sidebar:
    st.markdown("---")
    st.markdown("### 🔧 Configuration")
    st.markdown("Set your API keys:")
    st.code("export GROQ_API_KEY='your-key'")

    st.markdown("---")
    st.markdown("### 📊 GEO Metrics Explained")
    st.markdown("**AI Search Visibility**: Likelihood of appearing in AI search results")
    st.markdown("**Query Intent Matching**: How well content matches user queries")
    st.markdown("**Conversational Readiness**: Suitability for AI chat responses")
    st.markdown("**Citation Worthiness**: Probability of being cited by AI")

    st.markdown("---")
    st.markdown("### ℹ️ About")
    st.markdown("This tool analyzes websites for:")
    st.markdown("- 🤖 AI search optimization")
    st.markdown("- 💬 LLM compatibility")
    st.markdown("- 📊 GEO scoring")
    st.markdown("- 🎯 Content recommendations")

    st.markdown("---")
    st.markdown("*🚀 AI Content Optimizer - Built with Streamlit, LangChain, and Groq*")