MBilal-72 committed on
Commit
68b0980
·
verified ·
1 Parent(s): 18aaeb9

update app.py after utils

Browse files
Files changed (1) hide show
  1. app.py +459 -506
app.py CHANGED
@@ -1,555 +1,508 @@
 
 
 
 
 
 
1
  import os
2
  import tempfile
3
- import streamlit as st
4
  import json
5
- import requests
6
- from bs4 import BeautifulSoup
7
- from urllib.parse import urljoin, urlparse
8
- import time
9
- from typing import List, Dict, Any
10
- import pandas as pd
11
-
12
- from langchain_community.document_loaders import PyPDFLoader
13
- from langchain_community.vectorstores import FAISS
14
- from langchain_community.embeddings import HuggingFaceEmbeddings
15
- from langchain.chains import RetrievalQA
16
- from langchain.prompts import PromptTemplate, ChatPromptTemplate
17
- from langchain.schema import Document
18
- from langchain_groq import ChatGroq
19
-
20
- # --- Environment Variables ---
21
- GROQ_API_KEY = os.getenv("GROQ_API_KEY", "your-groq-api-key")
22
- HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "your-huggingface-api-key")
23
-
24
- # --- Initialize Groq LLM ---
25
- llm = ChatGroq(
26
- api_key=GROQ_API_KEY,
27
- model_name="llama3-8b-8192",
28
- temperature=0.1
29
- )
30
-
31
- # --- HuggingFace Embeddings ---
32
- embedding = HuggingFaceEmbeddings(
33
- model_name="sentence-transformers/all-MiniLM-L6-v2",
34
- cache_folder="./hf_cache",
35
- )
36
-
37
- # --- System Prompt for Content Enhancement ---
38
- system_prompt = """You are an AI Content Enhancement Specialist. Your purpose is to optimize user-provided text to maximize its effectiveness for large language models (LLMs) in search, question-answering, and conversational AI systems.
39
-
40
- Evaluate the input text based on the following criteria, assigning a score from 1–10 for each:
41
-
42
- Clarity: How easily can the content be understood?
43
-
44
- Structuredness: How well-organized and coherent is the content?
45
-
46
- LLM Answerability: How easily can an LLM extract precise answers from the content?
47
-
48
- Identify the most salient keywords.
49
-
50
- Rewrite the text to improve:
51
-
52
- Clarity and precision
53
-
54
- Logical structure and flow
55
-
56
- Suitability for LLM-based information retrieval
57
-
58
- Present your analysis and optimized text in the following JSON format:
59
 
60
- ```json
61
- {
62
- "score": {
63
- "clarity": 8.5,
64
- "structuredness": 7.0,
65
- "answerability": 9.0
66
- },
67
- "keywords": ["example", "installation", "setup"],
68
- "optimized_text": "..."
69
- }
70
- ```"""
71
 
72
- # --- GEO Analysis System Prompt ---
73
- geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided website content for its effectiveness in AI-powered search engines and LLM systems.
74
-
75
- Evaluate the content based on these GEO criteria (score 1-10 each):
76
-
77
- 1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
78
- 2. **Query Intent Matching**: How well does the content match common user queries?
79
- 3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
80
- 4. **Conversational Readiness**: How suitable is the content for AI chat responses?
81
- 5. **Semantic Richness**: How well does the content use relevant semantic keywords?
82
- 6. **Context Completeness**: Does the content provide complete, self-contained answers?
83
- 7. **Citation Worthiness**: How likely are AI systems to cite this content?
84
- 8. **Multi-Query Coverage**: Does the content answer multiple related questions?
85
-
86
- Also identify:
87
- - Primary topics and entities
88
- - Missing information gaps
89
- - Optimization opportunities
90
- - Specific enhancement recommendations
91
-
92
- Format your response as JSON:
93
-
94
- ```json
95
- {
96
- "geo_scores": {
97
- "ai_search_visibility": 7.5,
98
- "query_intent_matching": 8.0,
99
- "factual_accuracy": 9.0,
100
- "conversational_readiness": 6.5,
101
- "semantic_richness": 7.0,
102
- "context_completeness": 8.5,
103
- "citation_worthiness": 7.8,
104
- "multi_query_coverage": 6.0
105
- },
106
- "overall_geo_score": 7.5,
107
- "primary_topics": ["topic1", "topic2"],
108
- "entities": ["entity1", "entity2"],
109
- "missing_gaps": ["gap1", "gap2"],
110
- "optimization_opportunities": [
111
- {
112
- "type": "semantic_enhancement",
113
- "description": "Add more related terms",
114
- "priority": "high"
115
- }
116
- ],
117
- "recommendations": [
118
- "Specific actionable recommendation 1",
119
- "Specific actionable recommendation 2"
120
- ]
121
- }
122
- ```"""
123
-
124
- # --- Website Scraping Functions ---
125
- def extract_website_content(url: str, max_pages: int = 5) -> List[Dict[str, Any]]:
126
- """Extract content from website pages"""
127
- try:
128
- headers = {
129
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
130
- }
131
-
132
- response = requests.get(url, headers=headers, timeout=10)
133
- response.raise_for_status()
134
-
135
- soup = BeautifulSoup(response.content, 'html.parser')
136
-
137
- # Remove script and style elements
138
- for script in soup(["script", "style", "nav", "footer", "header"]):
139
- script.decompose()
140
-
141
- # Extract main content
142
- main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content') or soup.body
143
-
144
- if main_content:
145
- text_content = main_content.get_text(separator=' ', strip=True)
146
- else:
147
- text_content = soup.get_text(separator=' ', strip=True)
148
-
149
- # Clean up text
150
- lines = [line.strip() for line in text_content.split('\n') if line.strip()]
151
- cleaned_text = ' '.join(lines)
152
-
153
- # Extract metadata
154
- title = soup.find('title').get_text() if soup.find('title') else "No Title"
155
- meta_desc = soup.find('meta', attrs={'name': 'description'})
156
- description = meta_desc.get('content') if meta_desc else "No Description"
157
-
158
- # Extract headings
159
- headings = []
160
- for i in range(1, 7):
161
- for heading in soup.find_all(f'h{i}'):
162
- headings.append({
163
- 'level': i,
164
- 'text': heading.get_text(strip=True)
165
- })
166
-
167
- return [{
168
- 'url': url,
169
- 'title': title,
170
- 'description': description,
171
- 'content': cleaned_text[:10000], # Limit content length
172
- 'headings': headings,
173
- 'word_count': len(cleaned_text.split())
174
- }]
175
-
176
- except Exception as e:
177
- st.error(f"Error scraping {url}: {str(e)}")
178
- return []
179
 
180
- def analyze_page_geo_score(content: str, title: str, llm) -> Dict[str, Any]:
181
- """Analyze a single page for GEO score"""
182
- try:
183
- geo_prompt = ChatPromptTemplate.from_messages([
184
- ("system", geo_analysis_prompt),
185
- ("user", f"Title: {title}\n\nContent: {content}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  ])
187
 
188
- chain = geo_prompt | llm
189
- result = chain.invoke({"input": f"Title: {title}\n\nContent: {content}"})
190
 
191
- result_content = result.content if hasattr(result, 'content') else str(result)
 
192
 
193
- # Extract JSON from response
194
- json_start = result_content.find('{')
195
- json_end = result_content.rfind('}') + 1
 
 
 
 
 
 
 
196
 
197
- if json_start != -1 and json_end != -1:
198
- json_str = result_content[json_start:json_end]
199
- return json.loads(json_str)
200
- else:
201
- return {"error": "Could not parse GEO analysis"}
202
-
203
- except Exception as e:
204
- return {"error": f"Analysis failed: {str(e)}"}
205
-
206
- # --- Create Chat Prompt Template for Content Enhancement ---
207
- enhancement_prompt = ChatPromptTemplate.from_messages([
208
- ("system", system_prompt),
209
- ("user", "{input}")
210
- ])
211
-
212
- # --- Streamlit UI ---
213
- st.set_page_config(page_title="AI Content Optimizer", page_icon="πŸš€", layout="wide")
214
- st.title("πŸš€ AI Content Optimizer & GEO Analyzer")
215
-
216
- # Sidebar
217
- st.sidebar.title("πŸ› οΈ Tools")
218
- st.sidebar.markdown("- πŸ“„ Document Q&A")
219
- st.sidebar.markdown("- πŸ”§ Content Enhancement")
220
- st.sidebar.markdown("- 🌐 Website GEO Analysis")
221
- st.sidebar.markdown("- πŸ“Š SEO-like Scoring")
222
-
223
- # Create tabs
224
- tab1, tab2, tab3 = st.tabs(["πŸ“„ Document Chat", "πŸ”§ Content Enhancement", "🌐 Website GEO Analysis"])
225
-
226
- with tab1:
227
- st.header("Document Question Answering")
228
 
229
- uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
230
- pasted_text = st.text_area("Or paste some text below:", height=150)
231
- user_query = st.text_input("Ask a question about the content")
232
- submit_qa_button = st.button("Submit Question", key="qa_submit")
233
-
234
- if submit_qa_button:
235
- if not user_query.strip():
236
- st.warning("Please enter a question.")
237
- st.stop()
 
 
 
 
 
 
 
 
 
 
238
 
239
- documents = []
240
-
241
- if uploaded_file:
242
- with st.spinner("Processing PDF..."):
243
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
244
- tmp_file.write(uploaded_file.read())
245
- tmp_path = tmp_file.name
246
-
247
- loader = PyPDFLoader(tmp_path)
248
- documents = loader.load_and_split()
249
- os.unlink(tmp_path)
250
-
251
- elif pasted_text.strip():
252
- documents = [Document(page_content=pasted_text)]
253
- else:
254
- st.warning("Please upload a PDF or paste some text.")
255
- st.stop()
256
-
257
- with st.spinner("Creating embeddings..."):
258
- vectorstore = FAISS.from_documents(documents, embedding)
259
- retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
260
-
261
- qa_prompt_template = PromptTemplate(
262
- input_variables=["context", "question"],
263
- template="""You are an AI assistant. Use the following context to answer the question.
264
- Be concise, accurate, and helpful. If the answer is not in the context, say so.
265
-
266
- Context: {context}
267
- Question: {question}
268
- Answer:"""
269
- )
270
-
271
- qa_chain = RetrievalQA.from_chain_type(
272
- llm=llm,
273
- chain_type="stuff",
274
- retriever=retriever,
275
- return_source_documents=True,
276
- chain_type_kwargs={"prompt": qa_prompt_template}
277
- )
278
-
279
- with st.spinner("Generating answer..."):
280
  try:
281
- result = qa_chain({"query": user_query})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  st.markdown("### πŸ’¬ Answer")
283
  st.write(result["result"])
284
-
 
285
  with st.expander("πŸ“„ Source Documents"):
286
- for i, doc in enumerate(result["source_documents"]):
287
  st.write(f"**Source {i+1}:**")
288
- st.write(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content)
 
289
  if hasattr(doc, 'metadata') and doc.metadata:
290
  st.write(f"*Metadata: {doc.metadata}*")
291
  st.write("---")
292
-
293
  except Exception as e:
294
  st.error(f"An error occurred: {str(e)}")
295
-
296
- with tab2:
297
- st.header("Content Enhancement Analysis")
298
- enhancement_text = st.text_area("Enter text to analyze and enhance:", height=200, key="enhancement_input")
299
- submit_enhancement_button = st.button("Analyze & Enhance", key="enhancement_submit")
300
 
301
- if submit_enhancement_button:
302
- if not enhancement_text.strip():
303
- st.warning("Please enter some text to analyze.")
304
- st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
- with st.spinner("Analyzing content..."):
307
  try:
308
- enhancement_chain = enhancement_prompt | llm
309
- result = enhancement_chain.invoke({"input": enhancement_text})
310
- result_content = result.content if hasattr(result, 'content') else str(result)
 
 
 
 
 
 
 
 
311
 
 
312
  st.markdown("### πŸ“Š Analysis Results")
313
 
314
- try:
315
- json_start = result_content.find('{')
316
- json_end = result_content.rfind('}') + 1
 
317
 
318
- if json_start != -1 and json_end != -1:
319
- json_str = result_content[json_start:json_end]
320
- analysis_data = json.loads(json_str)
321
-
322
- st.markdown("#### Scores (1-10)")
323
- col1, col2, col3 = st.columns(3)
324
-
325
- with col1:
326
- clarity_score = analysis_data.get('score', {}).get('clarity', 'N/A')
327
- st.metric("Clarity", clarity_score)
328
-
329
- with col2:
330
- struct_score = analysis_data.get('score', {}).get('structuredness', 'N/A')
331
- st.metric("Structure", struct_score)
332
-
333
- with col3:
334
- answer_score = analysis_data.get('score', {}).get('answerability', 'N/A')
335
- st.metric("Answerability", answer_score)
336
-
337
- keywords = analysis_data.get('keywords', [])
338
- if keywords:
339
- st.markdown("#### πŸ”‘ Key Terms")
340
- st.write(", ".join(keywords))
341
-
342
- optimized_text = analysis_data.get('optimized_text', '')
343
- if optimized_text:
344
- st.markdown("#### ✨ Optimized Content")
345
- st.text_area("Enhanced version:", value=optimized_text, height=200, key="optimized_output")
346
- else:
347
- st.markdown("#### Analysis Response")
348
- st.write(result_content)
349
-
350
- except json.JSONDecodeError:
351
- st.markdown("#### Analysis Response")
352
- st.write(result_content)
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  except Exception as e:
355
- st.error(f"An error occurred during enhancement: {str(e)}")
356
-
357
- with tab3:
358
- st.header("🌐 Website GEO Analysis")
359
- st.markdown("Analyze any website for Generative Engine Optimization (GEO) - how well it performs with AI search engines.")
360
-
361
- col1, col2 = st.columns([2, 1])
362
 
363
- with col1:
364
- website_url = st.text_input("Enter website URL:", placeholder="https://example.com")
 
 
365
 
366
- with col2:
367
- max_pages = st.selectbox("Pages to analyze:", [1, 3, 5], index=0)
368
-
369
- analyze_website_button = st.button("πŸ” Analyze Website", key="website_analyze")
370
-
371
- if analyze_website_button:
372
- if not website_url.strip():
373
- st.warning("Please enter a website URL.")
374
- st.stop()
375
-
376
- # Add https:// if not present
377
- if not website_url.startswith(('http://', 'https://')):
378
- website_url = 'https://' + website_url
 
 
 
 
 
 
 
 
 
 
 
379
 
380
- with st.spinner(f"Analyzing website: {website_url}"):
381
  try:
382
- # Extract website content
383
- pages_data = extract_website_content(website_url, max_pages)
 
384
 
385
- if not pages_data:
386
- st.error("Could not extract content from the website.")
387
- st.stop()
388
-
389
- st.success(f"Successfully extracted content from {len(pages_data)} page(s)")
390
-
391
- # Analyze each page
392
- all_analyses = []
393
-
394
- for i, page_data in enumerate(pages_data):
395
- with st.spinner(f"Analyzing page {i+1}/{len(pages_data)}..."):
396
- analysis = analyze_page_geo_score(
397
- page_data['content'],
398
- page_data['title'],
399
- llm
400
- )
401
-
402
- if 'error' not in analysis:
403
- analysis['page_data'] = page_data
404
- all_analyses.append(analysis)
405
- else:
406
- st.warning(f"Could not analyze page {i+1}: {analysis['error']}")
407
-
408
- if all_analyses:
409
- # Display overall results
410
- st.markdown("## πŸ“Š GEO Analysis Results")
411
-
412
- # Calculate average scores
413
- avg_scores = {}
414
- score_keys = list(all_analyses[0].get('geo_scores', {}).keys())
415
-
416
- for key in score_keys:
417
- scores = [analysis['geo_scores'][key] for analysis in all_analyses if 'geo_scores' in analysis]
418
- avg_scores[key] = sum(scores) / len(scores) if scores else 0
419
-
420
- overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
421
-
422
- # Display metrics
423
- st.markdown("### 🎯 Overall GEO Scores")
424
-
425
- # Main score
426
- col1, col2, col3 = st.columns([1, 2, 1])
427
- with col2:
428
- st.metric("Overall GEO Score", f"{overall_avg:.1f}/10",
429
- delta=f"{overall_avg - 7.0:.1f}" if overall_avg >= 7.0 else f"{overall_avg - 7.0:.1f}")
430
-
431
- # Individual scores
432
- st.markdown("### πŸ“ˆ Detailed Metrics")
433
- col1, col2, col3, col4 = st.columns(4)
434
-
435
- metrics_display = [
436
- ("AI Search Visibility", "ai_search_visibility"),
437
- ("Query Intent Match", "query_intent_matching"),
438
- ("Factual Accuracy", "factual_accuracy"),
439
- ("Conversational Ready", "conversational_readiness")
440
- ]
441
-
442
- for i, (display_name, key) in enumerate(metrics_display):
443
- with [col1, col2, col3, col4][i]:
444
- score = avg_scores.get(key, 0)
445
- st.metric(display_name, f"{score:.1f}")
446
-
447
- col1, col2, col3, col4 = st.columns(4)
448
-
449
- metrics_display_2 = [
450
- ("Semantic Richness", "semantic_richness"),
451
- ("Context Complete", "context_completeness"),
452
- ("Citation Worthy", "citation_worthiness"),
453
- ("Multi-Query Cover", "multi_query_coverage")
454
- ]
455
-
456
- for i, (display_name, key) in enumerate(metrics_display_2):
457
- with [col1, col2, col3, col4][i]:
458
- score = avg_scores.get(key, 0)
459
- st.metric(display_name, f"{score:.1f}")
460
-
461
- # Recommendations
462
- st.markdown("### πŸ’‘ Optimization Recommendations")
463
-
464
- all_recommendations = []
465
- all_opportunities = []
466
-
467
- for analysis in all_analyses:
468
- all_recommendations.extend(analysis.get('recommendations', []))
469
- all_opportunities.extend(analysis.get('optimization_opportunities', []))
470
-
471
- # Remove duplicates
472
- unique_recommendations = list(set(all_recommendations))
473
 
474
- for i, rec in enumerate(unique_recommendations[:5], 1):
475
- st.write(f"**{i}.** {rec}")
 
476
 
477
- # Opportunities by priority
478
- if all_opportunities:
479
- st.markdown("### πŸš€ Priority Optimizations")
480
-
481
- high_priority = [opp for opp in all_opportunities if opp.get('priority') == 'high']
482
- medium_priority = [opp for opp in all_opportunities if opp.get('priority') == 'medium']
483
-
484
- if high_priority:
485
- st.markdown("#### πŸ”΄ High Priority")
486
- for opp in high_priority[:3]:
487
- st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
488
-
489
- if medium_priority:
490
- st.markdown("#### 🟑 Medium Priority")
491
- for opp in medium_priority[:3]:
492
- st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
493
 
494
- # Detailed page analysis
495
- with st.expander("πŸ“‹ Detailed Page Analysis"):
496
- for i, analysis in enumerate(all_analyses):
497
- page_data = analysis.get('page_data', {})
498
- st.markdown(f"#### Page {i+1}: {page_data.get('title', 'Unknown Title')}")
499
- st.write(f"**URL**: {page_data.get('url', 'Unknown')}")
500
- st.write(f"**Word Count**: {page_data.get('word_count', 0)}")
501
-
502
- if 'primary_topics' in analysis:
503
- st.write(f"**Topics**: {', '.join(analysis['primary_topics'])}")
504
-
505
- if 'entities' in analysis:
506
- st.write(f"**Entities**: {', '.join(analysis['entities'])}")
507
 
508
- st.write("---")
509
-
510
- # Export functionality
511
- st.markdown("### πŸ“₯ Export Results")
512
-
513
- if st.button("πŸ“Š Generate Report"):
514
- report_data = {
515
- 'website_url': website_url,
516
- 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S'),
517
- 'overall_score': overall_avg,
518
- 'individual_scores': avg_scores,
519
- 'recommendations': unique_recommendations,
520
- 'pages_analyzed': len(all_analyses)
521
- }
522
-
523
- st.json(report_data)
524
- st.success("Report generated! You can copy the JSON above for your records.")
525
 
526
- else:
527
  st.error("Could not analyze any pages from the website.")
 
 
 
 
 
 
 
 
 
 
 
 
528
 
 
 
 
 
 
 
 
529
  except Exception as e:
530
  st.error(f"An error occurred during website analysis: {str(e)}")
531
-
532
- # --- Sidebar Information ---
533
- with st.sidebar:
534
- st.markdown("---")
535
- st.markdown("### πŸ”§ Configuration")
536
- st.markdown("Set your API keys:")
537
- st.code("export GROQ_API_KEY='your-key'")
538
 
539
- st.markdown("---")
540
- st.markdown("### πŸ“– GEO Metrics Explained")
541
- st.markdown("**AI Search Visibility**: Likelihood of appearing in AI search results")
542
- st.markdown("**Query Intent Matching**: How well content matches user queries")
543
- st.markdown("**Conversational Readiness**: Suitability for AI chat responses")
544
- st.markdown("**Citation Worthiness**: Probability of being cited by AI")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
- st.markdown("---")
547
- st.markdown("### ℹ️ About")
548
- st.markdown("This tool analyzes websites for:")
549
- st.markdown("- πŸ€– AI search optimization")
550
- st.markdown("- πŸ’¬ LLM compatibility")
551
- st.markdown("- πŸ“Š GEO scoring")
552
- st.markdown("- 🎯 Content recommendations")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
- st.markdown("---")
555
- st.markdown("*πŸš€ AI Content Optimizer - Built with Streamlit, LangChain, and Groq*")
 
1
+ """
2
+ Main Streamlit Application - GEO SEO AI Optimizer
3
+ Entry point for the application with UI components
4
+ """
5
+
6
+ import streamlit as st
7
  import os
8
  import tempfile
 
9
  import json
10
+ from typing import Dict, Any, List
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ # Import our custom modules
13
+ from utils.parser import PDFParser, TextParser, WebpageParser
14
+ from utils.scorer import GEOScorer
15
+ from utils.optimizer import ContentOptimizer
16
+ from utils.chunker import VectorChunker
17
+ from utils.export import ResultExporter
 
 
 
 
 
18
 
19
+ # Import LangChain components
20
+ from langchain_groq import ChatGroq
21
+ from langchain_community.embeddings import HuggingFaceEmbeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ class GEOSEOApp:
24
+ """Main application class that orchestrates all components"""
25
+
26
def __init__(self):
    """Build the application by running each setup stage once, in order.

    Method of ``GEOSEOApp``. The stages run in the sequence config ->
    models -> parsers -> components.
    """
    setup_stages = (
        self.setup_config,
        self.setup_models,
        self.setup_parsers,
        self.setup_components,
    )
    for stage in setup_stages:
        stage()
31
+
32
def setup_config(self):
    """Load API keys from the environment and ensure the upload directory exists.

    Method of ``GEOSEOApp``. Stores ``groq_api_key`` and ``hf_api_key`` on
    the instance, falling back to placeholder strings when the environment
    variables are unset, then creates ``data/uploaded_files`` if missing.
    """
    environment = os.environ
    self.groq_api_key = environment.get("GROQ_API_KEY", "your-groq-api-key")
    self.hf_api_key = environment.get(
        "HUGGINGFACE_API_KEY", "your-huggingface-api-key"
    )

    # Upload directory is created eagerly; exist_ok makes reruns harmless.
    os.makedirs("data/uploaded_files", exist_ok=True)
39
+
40
def setup_models(self):
    """Create the chat LLM and the embedding model used by the app.

    Method of ``GEOSEOApp``. Sets ``self.llm`` to a ``ChatGroq`` client
    (model ``llama3-8b-8192``, temperature 0.1, keyed by
    ``self.groq_api_key``) and ``self.embeddings`` to a
    ``HuggingFaceEmbeddings`` instance for
    ``sentence-transformers/all-MiniLM-L6-v2`` with ``./hf_cache`` as the
    cache folder.
    """
    llm_options = {
        "api_key": self.groq_api_key,
        "model_name": "llama3-8b-8192",
        "temperature": 0.1,
    }
    self.llm = ChatGroq(**llm_options)

    embedding_options = {
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "cache_folder": "./hf_cache",
    }
    self.embeddings = HuggingFaceEmbeddings(**embedding_options)
52
+
53
def setup_parsers(self):
    """Instantiate one parser per supported content source.

    Method of ``GEOSEOApp``. Populates ``pdf_parser``, ``text_parser`` and
    ``webpage_parser`` on the instance, in that order.
    """
    parser_factories = (
        ("pdf_parser", PDFParser),
        ("text_parser", TextParser),
        ("webpage_parser", WebpageParser),
    )
    for attribute, factory in parser_factories:
        setattr(self, attribute, factory())
58
+
59
def setup_components(self):
    """Wire the processing components to the models created earlier.

    Method of ``GEOSEOApp``. The scorer and optimizer receive ``self.llm``,
    the chunker receives ``self.embeddings``, and the exporter takes no
    arguments.
    """
    chat_model = self.llm
    self.geo_scorer = GEOScorer(chat_model)
    self.content_optimizer = ContentOptimizer(chat_model)
    self.vector_chunker = VectorChunker(self.embeddings)
    self.result_exporter = ResultExporter()
65
+
66
def run(self):
    """Render the whole page: page config, title, sidebar and the three tabs.

    Method of ``GEOSEOApp``. Each tab's content is delegated to its own
    ``render_*`` method.
    """
    page_settings = {
        "page_title": "GEO SEO AI Optimizer",
        "page_icon": "πŸš€",
        "layout": "wide",
    }
    st.set_page_config(**page_settings)

    st.title("πŸš€ GEO SEO AI Optimizer")
    st.markdown("*Optimize your content for AI search engines and LLM systems*")

    # Sidebar is drawn before the tabs, matching the on-page order.
    self.render_sidebar()

    tab_labels = [
        "πŸ“„ Document Q&A",
        "πŸ”§ Content Enhancement",
        "🌐 Website GEO Analysis",
    ]
    qa_tab, enhancement_tab, website_tab = st.tabs(tab_labels)

    with qa_tab:
        self.render_document_qa_tab()

    with enhancement_tab:
        self.render_content_enhancement_tab()

    with website_tab:
        self.render_website_analysis_tab()
95
+
96
def render_sidebar(self):
    """Draw the static sidebar: tool list, configuration hint, metric and component notes.

    Method of ``GEOSEOApp``. Everything written here is fixed text; no
    widget state is read.
    """
    panel = st.sidebar

    panel.title("πŸ› οΈ GEO Tools")
    for tool_line in (
        "- πŸ“„ Document Q&A with RAG",
        "- πŸ”§ Content Enhancement",
        "- 🌐 Website GEO Analysis",
        "- πŸ“Š AI-First SEO Scoring",
    ):
        panel.markdown(tool_line)

    # Configuration section: shows the shell command for setting the key.
    panel.markdown("---")
    panel.markdown("### πŸ”§ Configuration")
    panel.markdown("Set your API keys:")
    panel.code("export GROQ_API_KEY='your-key'")

    panel.markdown("---")
    panel.markdown("### πŸ“– GEO Metrics")
    for metric_line in (
        "**AI Search Visibility**: How likely AI engines will surface your content",
        "**Query Intent Matching**: How well content matches user queries",
        "**Conversational Readiness**: Suitability for AI chat responses",
        "**Citation Worthiness**: Probability of being cited by AI",
    ):
        panel.markdown(metric_line)

    panel.markdown("---")
    panel.markdown("### ℹ️ Components")
    for component_line in (
        "- **Parser**: Extract content from various sources",
        "- **Scorer**: Analyze GEO performance",
        "- **Optimizer**: Enhance content for AI",
        "- **Chunker**: Create vector embeddings",
        "- **Exporter**: Generate reports",
    ):
        panel.markdown(component_line)
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
def render_document_qa_tab(self):
    """Render the Document Q&A tab and answer a question over uploaded or pasted content.

    Method of ``GEOSEOApp``. The user supplies a PDF or pasted text plus a
    question. On submit the content is parsed into documents, a QA chain is
    built via ``self.vector_chunker.create_qa_chain`` and the answer is shown
    together with the first 500 characters of each source document.

    An uploaded PDF takes precedence over pasted text. Any exception raised
    while parsing or answering is reported with ``st.error`` instead of
    propagating.
    """
    st.header("πŸ“„ Document Question Answering")
    st.markdown("Upload documents or paste text to ask questions using RAG.")

    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    pasted_text = st.text_area("Or paste text directly:", height=150)
    user_query = st.text_input("Ask a question about the content:")

    # Nothing to do until the user presses the submit button.
    if not st.button("πŸ” Ask Question", key="qa_submit"):
        return

    if not user_query.strip():
        st.warning("Please enter a question.")
        return

    try:
        if uploaded_file:
            with st.spinner("Processing PDF..."):
                # save_uploaded_file is defined outside this view; presumably
                # it returns the path of a temporary copy - TODO confirm.
                temp_path = self.save_uploaded_file(uploaded_file)
                try:
                    documents = self.pdf_parser.parse(temp_path)
                finally:
                    # Remove the temporary copy even when parsing fails, so
                    # failed uploads do not accumulate on disk.
                    os.unlink(temp_path)

        elif pasted_text.strip():
            with st.spinner("Processing text..."):
                documents = self.text_parser.parse(pasted_text)

        else:
            st.warning("Please upload a PDF or paste some text.")
            return

        with st.spinner("Creating embeddings and searching..."):
            qa_chain = self.vector_chunker.create_qa_chain(documents, self.llm)
            result = qa_chain({"query": user_query})

        st.markdown("### πŸ’¬ Answer")
        st.write(result["result"])

        with st.expander("πŸ“„ Source Documents"):
            for index, doc in enumerate(result.get("source_documents", []), start=1):
                st.write(f"**Source {index}:**")
                content = doc.page_content
                # Long sources are truncated to a 500-character preview.
                st.write(content[:500] + "..." if len(content) > 500 else content)
                if hasattr(doc, 'metadata') and doc.metadata:
                    st.write(f"*Metadata: {doc.metadata}*")
                st.write("---")

    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the page.
        st.error(f"An error occurred: {str(e)}")
 
 
 
 
 
184
 
185
def render_content_enhancement_tab(self):
    """Render the Content Enhancement tab: score the text, list keywords, show the rewrite.

    Method of ``GEOSEOApp``. On submit the text is passed to
    ``self.content_optimizer.optimize_content``. The result dict is read for
    ``error``, ``scores`` (``clarity`` / ``structuredness`` /
    ``answerability``), ``keywords`` and ``optimized_text``. A JSON download
    of the exporter's output is offered once results are available.

    Any exception is reported with ``st.error`` instead of propagating.
    """
    # Local import: the module's import block does not bring `time` into
    # scope, so the timestamped file name below would raise NameError.
    import time

    st.header("πŸ”§ Content Enhancement")
    st.markdown("Analyze and optimize your content for better AI/LLM performance.")

    input_text = st.text_area(
        "Enter content to analyze and enhance:",
        height=200,
        key="enhancement_input"
    )

    option_left, option_right = st.columns(2)
    with option_left:
        analyze_only = st.checkbox("Analysis only (no rewriting)", value=False)
    with option_right:
        include_keywords = st.checkbox("Include keyword suggestions", value=True)

    if not st.button("πŸ”§ Analyze & Enhance", key="enhancement_submit"):
        return

    if not input_text.strip():
        st.warning("Please enter some content to analyze.")
        return

    try:
        with st.spinner("Analyzing content..."):
            result = self.content_optimizer.optimize_content(
                input_text,
                analyze_only=analyze_only,
                include_keywords=include_keywords
            )

        if result.get("error"):
            st.error(f"Analysis failed: {result['error']}")
            return

        st.markdown("### πŸ“Š Analysis Results")

        scores = result.get("scores", {})
        if scores:
            metric_specs = (
                ("Clarity", "clarity"),
                ("Structure", "structuredness"),
                ("Answerability", "answerability"),
            )
            for column, (label, score_key) in zip(st.columns(3), metric_specs):
                with column:
                    st.metric(label, f"{scores.get(score_key, 0)}/10")

        keywords = result.get("keywords", [])
        if keywords:
            st.markdown("#### πŸ”‘ Key Terms")
            st.write(", ".join(keywords))

        optimized_text = result.get("optimized_text", "")
        if optimized_text and not analyze_only:
            st.markdown("#### ✨ Optimized Content")
            st.text_area(
                "Enhanced version:",
                value=optimized_text,
                height=200,
                key="optimized_output"
            )

        # The download button is rendered directly. A second st.button nested
        # inside the submit branch can never fire: clicking it reruns the
        # script, the outer button is False on that run, and this branch is
        # skipped.
        # NOTE(review): export_enhancement_results now runs on every
        # successful analysis - confirm it has no unwanted side effects.
        export_data = self.result_exporter.export_enhancement_results(result)
        st.download_button(
            label="Download Analysis Report",
            data=json.dumps(export_data, indent=2),
            file_name=f"content_analysis_{int(time.time())}.json",
            mime="application/json"
        )

    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the page.
        st.error(f"An error occurred: {str(e)}")
 
 
 
 
 
 
272
 
273
def render_website_analysis_tab(self):
    """Render the Website GEO Analysis tab: fetch pages, score each one, show and export results.

    Method of ``GEOSEOApp``. On submit the URL is normalised (whitespace
    stripped, ``https://`` prepended when no scheme is given) and pages are
    fetched with ``self.webpage_parser.parse_website``. Each page's
    ``content`` and ``title`` are scored with
    ``self.geo_scorer.analyze_page_geo``. Pages whose analysis carries an
    ``error`` entry are skipped with a warning. Successful analyses are shown
    via ``self.display_geo_results`` and offered as a JSON download.

    Any exception is reported with ``st.error`` instead of propagating.
    """
    # Local import: `re` is only needed to build a safe download file name.
    import re

    st.header("🌐 Website GEO Analysis")
    st.markdown("Analyze websites for Generative Engine Optimization (GEO) performance.")

    url_col, pages_col = st.columns([3, 1])

    with url_col:
        website_url = st.text_input(
            "Enter website URL:",
            placeholder="https://example.com"
        )

    with pages_col:
        max_pages = st.selectbox("Pages to analyze:", [1, 3, 5], index=0)

    option_left, option_right = st.columns(2)
    with option_left:
        include_subpages = st.checkbox("Include subpages", value=False)
    with option_right:
        detailed_analysis = st.checkbox("Detailed analysis", value=True)

    if not st.button("🌐 Analyze Website", key="website_analyze"):
        return

    # Strip first so that " https://x" is not mistaken for a scheme-less URL.
    website_url = website_url.strip()
    if not website_url:
        st.warning("Please enter a website URL.")
        return

    try:
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url

        with st.spinner(f"Analyzing website: {website_url}"):
            pages_data = self.webpage_parser.parse_website(
                website_url,
                max_pages=max_pages,
                include_subpages=include_subpages
            )

        if not pages_data:
            st.error("Could not extract content from the website.")
            return

        st.success(f"Successfully extracted content from {len(pages_data)} page(s)")

        geo_results = []
        total_pages = len(pages_data)
        with st.spinner("Calculating GEO scores..."):
            for page_number, page_data in enumerate(pages_data, start=1):
                with st.spinner(f"Analyzing page {page_number}/{total_pages}..."):
                    analysis = self.geo_scorer.analyze_page_geo(
                        page_data['content'],
                        page_data['title'],
                        detailed=detailed_analysis
                    )

                if analysis.get('error'):
                    st.warning(f"Could not analyze page {page_number}: {analysis['error']}")
                    continue

                # Keep the raw page next to its scores for later display/export.
                analysis['page_data'] = page_data
                geo_results.append(analysis)

        if not geo_results:
            st.error("Could not analyze any pages from the website.")
            return

        self.display_geo_results(geo_results, website_url)

        st.markdown("### πŸ“₯ Export Results")

        # The download button is rendered directly. A second st.button nested
        # inside the submit branch can never fire: clicking it reruns the
        # script, the outer button is False on that run, and this branch is
        # skipped.
        # NOTE(review): export_geo_results now runs on every successful
        # analysis - confirm it has no unwanted side effects.
        report_data = self.result_exporter.export_geo_results(
            geo_results,
            website_url
        )

        # Drop the scheme (http or https) and collapse anything that is not
        # safe in a file name into underscores.
        site_slug = re.sub(
            r"[^A-Za-z0-9._-]+", "_", website_url.split("://", 1)[-1]
        ).strip("_") or "site"

        st.download_button(
            label="Download GEO Report",
            data=json.dumps(report_data, indent=2),
            file_name=f"geo_analysis_{site_slug}.json",
            mime="application/json"
        )

    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the page.
        st.error(f"An error occurred during website analysis: {str(e)}")
 
 
 
 
 
 
 
364
 
365
def display_geo_results(self, geo_results: List[Dict], website_url: str):
    """Show the aggregated and per-page GEO scores for an analyzed website.

    Renders, in order: the overall score (mean of the per-metric averages),
    two rows of four metric tiles, the recommendations section and an
    expander with one entry per analyzed page.
    """
    st.markdown("## πŸ“Š GEO Analysis Results")

    # Per-metric averages, then a single overall figure from those averages.
    avg_scores = self.calculate_average_scores(geo_results)
    if avg_scores:
        overall_avg = sum(avg_scores.values()) / len(avg_scores)
    else:
        overall_avg = 0

    # Headline score in the wide middle column; the delta is the distance
    # from 7.0 and is omitted when the score is exactly 7.0.
    delta_text = None if overall_avg == 7.0 else f"{overall_avg - 7.0:.1f}"
    _left, centre, _right = st.columns([1, 2, 1])
    with centre:
        st.metric(
            "Overall GEO Score",
            f"{overall_avg:.1f}/10",
            delta=delta_text
        )

    st.markdown("### πŸ“ˆ Detailed GEO Metrics")

    # (label, score key) pairs, four per row of tiles.
    metric_rows = (
        (
            ("AI Search Visibility", "ai_search_visibility"),
            ("Query Intent Match", "query_intent_matching"),
            ("Factual Accuracy", "factual_accuracy"),
            ("Conversational Ready", "conversational_readiness"),
        ),
        (
            ("Semantic Richness", "semantic_richness"),
            ("Context Complete", "context_completeness"),
            ("Citation Worthy", "citation_worthiness"),
            ("Multi-Query Cover", "multi_query_coverage"),
        ),
    )
    for row in metric_rows:
        for column, (label, score_key) in zip(st.columns(4), row):
            with column:
                # Metrics absent from the averages are shown as 0.0.
                st.metric(label, f"{avg_scores.get(score_key, 0):.1f}")

    # Recommendations
    self.display_recommendations(geo_results)

    # Per-page breakdown
    with st.expander("πŸ“‹ Detailed Page Analysis"):
        for page_number, analysis in enumerate(geo_results, start=1):
            page_data = analysis.get('page_data', {})
            st.markdown(f"#### Page {page_number}: {page_data.get('title', 'Unknown Title')}")
            st.write(f"**URL**: {page_data.get('url', 'Unknown')}")
            st.write(f"**Word Count**: {page_data.get('word_count', 0)}")

            # Optional sections are only rendered when the key is present.
            if 'primary_topics' in analysis:
                topics_text = ', '.join(analysis['primary_topics'])
                st.write(f"**Topics**: {topics_text}")

            if 'entities' in analysis:
                entities_text = ', '.join(analysis['entities'])
                st.write(f"**Entities**: {entities_text}")

            if 'geo_scores' in analysis:
                score_text = ", ".join(
                    f"{k}: {v:.1f}" for k, v in analysis['geo_scores'].items()
                )
                st.write(f"**Scores**: {score_text}")

            st.write("---")
438
 
439
def display_recommendations(self, geo_results: List[Dict]):
    """Show the optimization recommendations gathered from all pages.

    Lists up to five distinct recommendations, in the order they were first
    encountered, followed by up to three high-priority and three
    medium-priority optimization opportunities.

    Args:
        geo_results: Per-page analysis dicts. The optional keys read here
            are ``'recommendations'`` (items assumed hashable, e.g. strings)
            and ``'optimization_opportunities'`` (dicts with ``'priority'``,
            ``'type'`` and ``'description'``).
    """
    st.markdown("### πŸ’‘ Optimization Recommendations")

    # Collect all recommendations; "or []" also covers an explicit None.
    all_recommendations = []
    all_opportunities = []
    for analysis in geo_results:
        all_recommendations.extend(analysis.get('recommendations') or [])
        all_opportunities.extend(analysis.get('optimization_opportunities') or [])

    # De-duplicate while keeping first-seen order. A set would hand back the
    # items in arbitrary order, so which five get displayed could change
    # from one run to the next.
    unique_recommendations = list(dict.fromkeys(all_recommendations))
    for i, rec in enumerate(unique_recommendations[:5], 1):
        st.write(f"**{i}.** {rec}")

    # Only dict entries can be filtered by priority; anything else is skipped
    # rather than raising on .get().
    opportunities = [opp for opp in all_opportunities if isinstance(opp, dict)]
    if not opportunities:
        return

    st.markdown("#### πŸš€ Priority Optimizations")

    priority_sections = (
        ('high', "##### πŸ”΄ High Priority"),
        ('medium', "##### 🟑 Medium Priority"),
    )
    for priority, heading in priority_sections:
        matching = [opp for opp in opportunities if opp.get('priority') == priority]
        if not matching:
            continue
        st.markdown(heading)
        for opp in matching[:3]:
            st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
474
+
475
def calculate_average_scores(self, geo_results: List[Dict]) -> Dict[str, float]:
    """Average each GEO score across all analyzed pages.

    Every score key found in any page's ``'geo_scores'`` dict is averaged
    over the pages that actually provide a numeric value for it, so a page
    without scores (or with only some of them) neither hides nor dilutes the
    scores of the other pages.

    Args:
        geo_results: Per-page analysis dicts; only ``'geo_scores'`` is read.

    Returns:
        Mapping of score key to its mean value, in first-seen key order.
        Empty when no page supplies a numeric score.
    """
    totals: Dict[str, float] = {}
    counts: Dict[str, int] = {}

    for result in geo_results or []:
        page_scores = result.get('geo_scores')
        if not isinstance(page_scores, dict):
            continue
        for key, value in page_scores.items():
            # Skip non-numeric values (bool is excluded explicitly because
            # it is an int subclass) instead of letting the sum raise.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            totals[key] = totals.get(key, 0) + value
            counts[key] = counts.get(key, 0) + 1

    return {key: totals[key] / counts[key] for key in totals}
493
+
494
def save_uploaded_file(self, uploaded_file) -> str:
    """Write an uploaded file to a temporary ``.pdf`` file and return its path.

    Args:
        uploaded_file: File-like object exposing ``read()`` (and, optionally,
            ``seek()``).

    Returns:
        Path of the temporary file. It is created with ``delete=False``, so
        the caller is responsible for removing it.
    """
    # Rewind first: if the stream was already read (for example on an
    # earlier pass over the same upload), read() would return b"" and an
    # empty file would be written.
    rewind = getattr(uploaded_file, "seek", None)
    if callable(rewind):
        rewind(0)

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        saved_path = tmp_file.name
    return saved_path
499
+
500
+
501
def main():
    """Build the GEO SEO application and start it."""
    GEOSEOApp().run()


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    main()