| import re |
| import json |
| import requests |
| import html as html_lib |
| import time |
| from typing import Optional |
| from fastapi import FastAPI |
| from fastapi.responses import HTMLResponse, PlainTextResponse |
| import random |
|
|
# FastAPI application object; the route handlers below are registered on it.
app = FastAPI()


# Base URL of the Pollinations text-generation API; the percent-encoded prompt
# is appended as the URL path (see call_pollinations).
POLLINATIONS_URL = "https://text.pollinations.ai/prompt/"
|
|
| |
|
|
| |
# Prompt template for step 1 (generate_headlines): asks the model for a JSON
# table of contents. The doubled braces are literal braces that survive
# str.format(); only {topic} is substituted.
HEADLINES_PROMPT = """
You are an AI that produces a table of contents, for a neutral, encyclopedic Wikipedia-style article.
Write about the topic: "{topic}".
Output ONLY valid JSON and NOTHING else. Do not add explanatory text, headers, markdown or code fences.
Format exactly:
{{
"title": "string",
"lead": "string",
"sections": [
{{
"heading": "string",
"subsections": [
{{
"subheading": "string"
}}
]
}}
],
"last_edited": "string" /* optional */
}}
"""
|
|
| |
# Prompt template for step 2 (generate_article_content): sends the TOC
# structure back to the model and asks it to add a "content" field to every
# section and subsection. {topic} and {structure_json} are substituted via
# str.format(); doubled braces are literal.
ARTICLE_PROMPT = """
You are an AI that writes a complete, neutral, and detailed encyclopedic Wikipedia-style article.
The topic is "{topic}".
You have been given a JSON structure containing headings and subheadings. Your task is to write the content for this structure.

Instructions:
1. **Content Depth:** Write a detailed paragraph for each heading and subheading. Paragraphs for the main headings should be especially comprehensive, consisting of several sentences to provide a thorough overview of the section's topic.
2. **Structure:** Do not invent new sections. Stick strictly to the provided input structure.
3. **Output Format:** Output ONLY a valid JSON object and NOTHING else. The output JSON must have the exact same structure as the input, but with a "content" field added to each section and subsection.

Input Structure:
{structure_json}

Output Format Example:
{{
"sections": [
{{
"heading": "History",
"content": "The history of the topic is long and varied, with early concepts dating back to ancient philosophy. Key developments in the 20th century, particularly the work on [[Turing Machines]], laid the groundwork for the modern field.",
"subsections": [
{{
"subheading": "Early developments",
"content": "In the early days, developments were slow and often theoretical..."
}}
]
}}
]
}}
"""
|
|
| |
| RAW_LOG = {} |
|
|
| |
def call_pollinations(prompt: str) -> str:
    """Call the Pollinations text API and return the raw response body.

    The prompt is percent-encoded into the URL path; the random ``seed``
    query parameter varies every call (presumably to defeat upstream
    response caching — TODO confirm against the Pollinations API docs).

    Args:
        prompt: Plain-text prompt to send to the model.

    Returns:
        The untouched response text (no stripping, no JSON parsing).

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or the 60 s timeout.
    """
    # SECURITY(review): the API token is hardcoded in source. Move it to an
    # environment variable or secret store before sharing/publishing this file.
    seed = random.randint(0, 999999)
    uri = (
        POLLINATIONS_URL
        + requests.utils.requote_uri(prompt)
        + f"?token=ZJyDM8G0LiZnNxFf&model=gemini&json=true&seed={seed}"
    )
    r = requests.get(uri, timeout=60)
    r.raise_for_status()
    return r.text
|
|
def extract_json(text: str) -> dict:
    """Extract and parse the JSON object spanning the outermost braces.

    Takes the substring from the first ``{`` to the last ``}`` — tolerant of
    chatty model output that wraps the JSON in prose or code fences.

    Args:
        text: Raw model response expected to contain one JSON object.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If no brace-delimited span exists or it fails to parse;
            the message embeds truncated copies of the offending text.
    """
    start = text.find("{")
    end = text.rfind("}")
    # BUG FIX: the original computed `end = text.rfind("}") + 1` and then
    # tested `end == -1`, which can never be true after the +1 shift. Test
    # the raw rfind result before shifting instead.
    if start == -1 or end == -1 or start > end:
        raise ValueError(
            "No JSON object found in AI response.\n\nRaw (truncated):\n" + text[:2000]
        )
    json_str = text[start:end + 1]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        # Chain the cause so the original decode error is preserved.
        raise ValueError(
            f"Failed to parse JSON: {e}\n\nExtracted (truncated):\n{json_str[:2000]}"
            f"\n\nRaw (truncated):\n{text[:2000]}"
        ) from e
|
|
def log_raw(topic: str, prompt: str, response: str):
    """Record one prompt/response exchange for *topic* in the RAW_LOG store."""
    if topic not in RAW_LOG:
        RAW_LOG[topic] = []
    RAW_LOG[topic].append((prompt, response))
|
|
| |
def generate_headlines(topic: str) -> dict:
    """Step 1: ask the model for the article structure (title, lead, TOC).

    Args:
        topic: URL path segment naming the article subject; underscores are
            turned into spaces when used as a fallback title.

    Returns:
        Parsed JSON dict guaranteed to contain "title", "lead", and
        "sections" keys (backfilled with defaults when the model omits them).

    Raises:
        ValueError: If the model response contains no parseable JSON.
        requests.RequestException: On network/HTTP failure.
    """
    prompt = HEADLINES_PROMPT.format(topic=topic)
    resp = call_pollinations(prompt)
    log_raw(topic, prompt, resp)
    data = extract_json(resp)

    # Backfill fields the model may omit. setdefault() already leaves
    # existing keys untouched, so the original's
    # `data.setdefault(k, data.get(k, default))` was a redundant double
    # lookup — plain defaults are equivalent.
    data.setdefault("title", topic.replace("_", " "))
    data.setdefault("lead", "")
    data.setdefault("sections", [])
    return data
|
|
def generate_article_content(topic: str, toc_structure: dict) -> dict:
    """Step 2: fill in prose for every heading/subheading in one model call.

    The TOC is first slimmed down to only the fields the prompt template
    documents ("heading" and "subsections") before being serialized.
    """
    slim_sections = []
    for section in toc_structure.get("sections", []):
        slim_sections.append(
            {
                "heading": section.get("heading"),
                "subsections": section.get("subsections", []),
            }
        )
    structure_json = json.dumps({"sections": slim_sections}, indent=2)

    prompt = ARTICLE_PROMPT.format(topic=topic, structure_json=structure_json)
    resp = call_pollinations(prompt)
    log_raw(topic, prompt, resp)
    return extract_json(resp)
|
|
| |
| def esc(s): return html_lib.escape(s) if isinstance(s, str) else "" |
|
|
def render_page(article: dict, execution_time: Optional[float] = None) -> str:
    """Render the final HTML page from the fully-populated article JSON.

    Args:
        article: Dict with "title", "lead", and "sections" (each section
            carrying "heading"/"content" and optional "subsections"), plus an
            optional "last_edited" string for the footer.
        execution_time: Seconds taken to build the article; shown in the
            footer when provided.

    Returns:
        A complete HTML document as a single string.
    """
    title = esc(article.get("title", "Untitled"))
    lead = esc(article.get("lead", ""))

    # Wikipedia-look stylesheet, inlined so the page is self-contained.
    css = """body{font-family:sans-serif;margin:0;background:#f6f6f7;color:#202122}#container{display:flex;min-height:100vh}#left-sidebar{width:18%;padding:1.2em;background:#f6f6f7;border-right:1px solid #a7d7f9;box-sizing:border-box}#main-content{width:82%;padding:1.6em;background:#fff;box-sizing:border-box}header{display:flex;justify-content:space-between;align-items:center;border-bottom:1px solid #a7d7f9;padding-bottom:.6em;margin-bottom:1em}#main-title{font-family:Georgia,serif;font-size:2em;margin:0 0 .2em 0;font-weight:normal}.site-sub{color:#54595d;margin-top:0;font-size:.95em}h2{font-size:1.3em;margin-top:1.2em;border-bottom:1px solid #a2a9b1;padding-bottom:.2em;font-weight:normal}h3{font-size:1.05em;margin-top:.8em}p{line-height:1.6}#toc{background:#f8f9fa;border:1px solid #a2a9b1;padding:1em;margin-bottom:1em;display:inline-block}footer{margin-top:2em;border-top:1px solid #a2a9b1;padding-top:1em;color:#54595d;font-size:.85em}.references ol{padding-left:1.2em}"""

    # Document head, sidebar, and page header.
    parts = [
        "<!doctype html><html lang='en'><head><meta charset='utf-8'>",
        f"<title>{title} - Wikipedai</title>",
        "<link rel='icon' href='https://huggingface.co/spaces/NihalGazi/Wikipedai/resolve/main/wikipedai.png'>",
        f"<style>{css}</style></head><body><div id='container'><div id='left-sidebar'>",
        "<div style='text-align:center;margin-bottom:1em;'><a href='/'><img src='https://huggingface.co/spaces/NihalGazi/Wikipedai/resolve/main/wikipedai_logo.png' alt='logo' style='width:90px'></a></div>",
        "<div style='margin-bottom:1em;'><strong>Main menu</strong><ul style='padding-left:1em;'><li><a href='#'>Main page</a></li><li><a href='#'>Contents</a></li><li><a href='#'>Random article</a></li></ul></div></div>",
        "<div id='main-content'><header><div><a href='#'>Article</a> • <a href='#'>Talk</a></div><div><input placeholder='Search' id='search_bar' style='padding:.4em;border:1px solid #a2a9b1'></div></header>",
        f"<main><h1 id='main-title'>{title}</h1><p class='site-sub'>From Wikipedai, the free encyclopedai</p>",
    ]

    if lead:
        parts.append(f"<p><strong>{lead}</strong></p>")

    # Table of contents, numbered to match the anchors emitted below.
    if article.get("sections"):
        parts.append("<div id='toc'><h2>Contents</h2><ul>")
        for i, sec in enumerate(article.get("sections", []), 1):
            parts.append(f"<li><a href='#sec{i}'>{i}. {esc(sec.get('heading',''))}</a></li>")
            if sec.get("subsections"):
                parts.append("<ul>")
                for j, sub in enumerate(sec.get("subsections", []), 1):
                    parts.append(f"<li><a href='#sec{i}_sub{j}'>{i}.{j} {esc(sub.get('subheading',''))}</a></li>")
                parts.append("</ul>")
        parts.append("</ul></div>")

    # Section and subsection bodies.
    for i, sec in enumerate(article.get("sections", []), 1):
        parts.append(f"<h2 id='sec{i}'><span class='mw-headline'>{esc(sec.get('heading',''))}</span></h2>")
        if sec.get("content"):
            parts.append(f"<p>{esc(sec.get('content',''))}</p>")
        for j, sub in enumerate(sec.get("subsections", []) or [], 1):
            parts.append(f"<h3 id='sec{i}_sub{j}'><span class='mw-headline'>{esc(sub.get('subheading',''))}</span></h3>")
            if sub.get("content"):
                parts.append(f"<p>{esc(sub.get('content',''))}</p>")

    # Footer metadata (last-edited note and generation time, if available).
    footer_parts = []
    if article.get("last_edited"):
        footer_parts.append(f"This page was last edited on {esc(article.get('last_edited', ''))}")
    if execution_time is not None:
        footer_parts.append(f"Page generated in {execution_time:.2f} seconds")
    footer_content = " • ".join(footer_parts)

    # BUG FIX: the search-bar <script> used to be appended AFTER the closing
    # </body></html> tags, producing invalid HTML. Close the containers here
    # and defer </body></html> until the script has been emitted.
    parts.append(f"</main><footer>{footer_content}</footer></div></div>")

    # Search-bar handler: Enter redirects to /wikipedai/<encoded query>.
    js = """
<script>
document.getElementById('search_bar').addEventListener('keydown', function(event) {
    // Check if the key pressed was 'Enter'
    if (event.key === 'Enter') {
        // Prevent any default action
        event.preventDefault();

        // Get the user's query from the input field
        const query = document.getElementById('search_bar').value;

        // If the query is empty, do nothing
        if (!query) {
            return;
        }

        // URI-encode the query to handle special characters safely
        const encodedQuery = encodeURIComponent(query);

        // Construct the final URL for the API
        const apiUrl = `https://nihalgazi-wikipedai.hf.space/wikipedai/${encodedQuery}`;

        // Redirect the browser to the API URL
        window.location.href = apiUrl;
    }
});
</script>
"""

    parts.append(js)
    parts.append("</body></html>")
    return "\n".join(parts)
|
|
| |
|
|
@app.get("/wikipedai/{topic}", response_class=HTMLResponse)
def wikipedai(topic: str):
    """Generate and serve a complete article page for *topic*.

    Pipeline: (1) fetch the TOC structure, (2) fetch prose for that
    structure, (3) merge the prose into the structure positionally,
    (4) render everything to HTML. Any failure returns a 500 page that
    embeds the escaped traceback.
    """
    start_time = time.time()
    RAW_LOG[topic] = []  # fresh raw log for this request

    try:
        structure = generate_headlines(topic)
        generated = generate_article_content(topic, structure)

        # Positional merge of generated prose into the TOC structure.
        # zip() stops at the shorter list, which matches the original
        # index-bounds checks: extra structure entries are simply left
        # without a "content" field.
        for sec, sec_content in zip(structure.get("sections", []),
                                    generated.get("sections", [])):
            sec["content"] = sec_content.get("content", "[Content not generated]")
            for sub, sub_content in zip(sec.get("subsections", []),
                                        sec_content.get("subsections", [])):
                sub["content"] = sub_content.get("content", "[Content not generated]")

        elapsed = time.time() - start_time
        page = render_page(structure, execution_time=elapsed)
        return HTMLResponse(content=page, status_code=200)

    except Exception as e:
        import traceback
        error_details = f"Error: {e}\n\nTraceback:\n{traceback.format_exc()}"
        return HTMLResponse(content=f"<h1>Error</h1><pre>{html_lib.escape(error_details)}</pre>", status_code=500)
|
|
@app.get("/raw/{topic}", response_class=PlainTextResponse)
def raw(topic: str):
    """Dump every prompt/response pair logged for *topic* as plain text."""
    entries = RAW_LOG.get(topic, [])
    if not entries:
        return PlainTextResponse(f"No raw log found for topic '{topic}'. Try calling /wikipedai/{topic} first.", status_code=404)

    chunks = [
        f"--- Input [{n}] ---\n{prompt}\n\n--- AI response [{n}] ---\n{resp}\n"
        for n, (prompt, resp) in enumerate(entries, start=1)
    ]
    return PlainTextResponse("\n".join(chunks), status_code=200)