Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import os | |
| os.environ['HF_HOME'] = '/tmp/hf_cache' | |
| os.makedirs(os.environ['HF_HOME'], exist_ok=True) # Ensure the directory exists | |
| import gradio as gr | |
| import subprocess | |
| import os | |
| import re | |
| import tempfile | |
| import json | |
| import csv | |
| # Removed: from typing import Iterable # Added for Theme | |
| from rag_scraper.scraper import Scraper | |
| from rag_scraper.converter import Converter | |
| from rag_scraper.link_extractor import LinkExtractor, LinkType | |
| from rag_scraper.utils import URLUtils | |
| # Removed: from gradio.themes.base import Base # Added for Theme | |
| # Removed: from gradio.themes.utils import colors, fonts, sizes # Added for Theme | |
| import markdown_pdf # Added for PDF conversion | |
| # --- Custom Theme Definition --- (REMOVED Seafoam class and instance) | |
def is_github_repo(url_or_id):
    """Return True when the input looks like a GitHub repo URL or an owner/repo ID."""
    # Either an explicit github.com URL, or a bare "owner/repo" identifier.
    looks_like_repo_id = re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id) is not None
    return "github.com" in url_or_id or looks_like_repo_id
def check_repomix_installed():
    """Return True when the `repomix` CLI is available and runnable."""
    try:
        probe = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception:
        # Binary missing or not executable — treat as "not installed".
        return False
    return probe.returncode == 0
def run_repomix(repo_url_or_id, progress=gr.Progress(track_tqdm=True)):
    """Run Repomix on a GitHub repository and return its Markdown output.

    Args:
        repo_url_or_id: Full GitHub URL, or a bare "owner/repo" ID.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        (content, file_path) on success, where file_path is a persistent
        temporary copy of the Repomix output; (error_string, None) on failure.
    """
    progress(0, desc="Starting Repomix processing...")
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file_name = "repomix-output.md"
            output_file_path = os.path.join(temp_dir, output_file_name)
            # Expand a bare "owner/repo" ID into a full GitHub URL.
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            progress(0.2, desc=f"Running Repomix on {repo_url}...")
            cmd = [
                "repomix",
                "--remote", repo_url,
                "--output", output_file_path,
                "--style", "markdown",
                "--compress"
            ]
            process = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding='utf-8')
            progress(0.8, desc="Repomix command executed.")
            if process.returncode != 0:
                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                return f"Error running Repomix:\n{error_details}", None
            if not os.path.exists(output_file_path):
                error_details = f"Return Code: {process.returncode}\nStderr: {process.stderr}\nStdout: {process.stdout}"
                return f"Error: Repomix did not generate an output file at '{output_file_path}'.\nRepomix Output:\n{error_details}", None
            with open(output_file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            # BUG FIX: output_file_path lives inside the TemporaryDirectory and
            # is deleted the moment this `with` block exits, so the original
            # returned a path to a file that no longer existed. Persist the
            # content to a non-deleting temp file and return that path instead.
            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md", encoding="utf-8") as persistent:
                persistent.write(content)
                persistent_path = persistent.name
            progress(1, desc="Repomix output processed.")
            return content, persistent_path
    except Exception as e:
        progress(1, desc="Error during Repomix processing.")
        return f"Error processing GitHub repository: {str(e)}", None
def scrape_and_convert_website(url, depth, progress=gr.Progress(track_tqdm=True)):
    """Recursively scrape a site, convert each page to Markdown, and persist the result.

    Returns (markdown_content, temp_file_path).
    """
    progress(0, desc=f"Starting web scrape for {url}...")
    seen = set()

    def _crawl(page_url, remaining_depth, total_links_estimate=1, link_index=0):
        # Guard: skip revisits and stop once the depth budget is spent.
        if page_url in seen or remaining_depth < 0:
            return ""
        seen.add(page_url)
        try:
            fraction = link_index / total_links_estimate if total_links_estimate > 0 else 0
            progress(fraction, desc=f"Scraping: {page_url} (Depth: {depth - remaining_depth})")
            html_content = Scraper.fetch_html(page_url)
        except Exception as e:
            return f"Error fetching {page_url}: {str(e)}\n"
        collected = f"## Extracted from: {page_url}\n\n" + Converter.html_to_markdown(
            html=html_content,
            base_url=page_url,
            parser_features='html.parser',
            ignore_links=True
        ) + "\n\n"
        if remaining_depth > 0:
            try:
                candidates = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                internal_links = [
                    candidate for candidate in candidates
                    if URLUtils.is_internal(candidate, page_url) and candidate not in seen
                ]
                for idx, child_url in enumerate(internal_links):
                    collected += _crawl(child_url, remaining_depth - 1, len(internal_links), idx)
            except Exception as e:
                collected += f"Error extracting links from {page_url}: {str(e)}\n"
        return collected

    all_markdown_content = _crawl(url, depth)
    progress(1, desc="Web scraping complete.")
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
        tmp_file.write(all_markdown_content)
    return all_markdown_content, tmp_file.name
def convert_to_json(markdown_content, source_url_or_id):
    """Serialize the content plus its source identifier as pretty-printed JSON."""
    payload = {
        "source": source_url_or_id,
        "content": markdown_content,
    }
    return json.dumps(payload, indent=2)
def convert_to_csv(markdown_content, source_url_or_id):
    """Write a two-row CSV (header + data) to a persistent temp file; return its path."""
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, newline='', suffix=".csv", encoding="utf-8") as handle:
        csv.writer(handle).writerows([
            ["source", "content"],
            [source_url_or_id, markdown_content],
        ])
        return handle.name
def save_output_to_file(content, output_format, source_url_or_id):
    """Save `content` to a temporary file in the requested format and return its path.

    Args:
        content: Markdown content to persist.
        output_format: One of "Markdown", "JSON", "CSV", "Text", "PDF".
        source_url_or_id: Source identifier embedded into JSON/CSV output.

    Returns:
        Path to the written file. If PDF conversion fails, the Markdown is
        saved with a ".pdf.md" suffix instead.
    """
    processed_content = content  # Default for Markdown and Text
    if output_format == "JSON":
        suffix = ".json"
        processed_content = convert_to_json(content, source_url_or_id)
    elif output_format == "CSV":
        # convert_to_csv writes its own file and returns the path directly.
        return convert_to_csv(content, source_url_or_id)
    elif output_format == "Text":
        suffix = ".txt"
    elif output_format == "PDF":
        suffix = ".pdf"
        # PDF conversion creates its file directly rather than via the
        # shared write at the bottom of this function.
        pdf_output_path = ""
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf_file:
                pdf_output_path = tmp_pdf_file.name
            md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
            md_pdf.convert_from_string(content, pdf_output_path)
            return pdf_output_path
        except Exception as e:
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            # BUG FIX: remove the empty placeholder .pdf file that the failed
            # conversion would otherwise leave behind on disk.
            if pdf_output_path and os.path.exists(pdf_output_path):
                try:
                    os.remove(pdf_output_path)
                except OSError:
                    pass
            suffix = ".pdf.md"
            # processed_content is already Markdown; fall through and save it.
    else:  # Default to Markdown
        suffix = ".md"
    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp_file:
        tmp_file.write(processed_content)
    return tmp_file.name
def process_input_updated(url_or_id, source_type, depth, output_format_selection, progress=gr.Progress(track_tqdm=True)):
    """Top-level pipeline wired to the UI: fetch content, convert, and package it.

    Args:
        url_or_id: Webpage URL, or a GitHub URL / "owner/repo" ID.
        source_type: "Webpage" or "GitHub Repository".
        depth: Link-following depth for webpage scraping (ignored for repos).
        output_format_selection: "Markdown", "JSON", "CSV", "Text" or "PDF".
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        (status_message, preview_content, output_file_path) for the UI outputs.
    """
    progress(0, desc="Initializing...")
    raw_content = ""
    error_message = ""
    output_file_path = None
    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            error_message = "Repomix is not installed or not accessible. Please ensure it's installed globally."
            return error_message, None, None
        raw_content, _ = run_repomix(url_or_id, progress=progress)
        # Error protocol: helpers signal failure by returning a string that
        # contains "Error". NOTE(review): this can false-positive if the
        # scraped content itself contains the word "Error".
        if "Error" in raw_content:
            error_message = raw_content
            raw_content = ""
    elif source_type == "Webpage":
        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if "Error" in raw_content:
            error_message = raw_content
            raw_content = ""
    else:
        error_message = "Invalid source type selected."
        return error_message, None, None
    if error_message:
        return error_message, None, None
    try:
        progress(0.9, desc=f"Converting to {output_format_selection}...")
        output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
        preview_content = raw_content
        if output_format_selection == "JSON":
            preview_content = convert_to_json(raw_content, url_or_id)
        elif output_format_selection == "CSV" and output_file_path:
            try:
                # Preview only the first few CSV lines to keep the UI light.
                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                    csv_preview_lines = [next(f_csv) for _ in range(5)]
                preview_content = "".join(csv_preview_lines)
                if not preview_content:
                    preview_content = "[CSV content is empty or very short]"
            except StopIteration:
                # Fewer than 5 lines in the file: read it whole instead.
                with open(output_file_path, 'r', encoding='utf-8') as f_csv:
                    preview_content = f_csv.read()
                if not preview_content:
                    preview_content = "[CSV content is empty]"
            except Exception as e_csv_preview:
                preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
        elif output_format_selection == "CSV" and not output_file_path:
            preview_content = "[CSV file path not available for preview]"
        elif output_format_selection == "PDF":
            preview_content = f"[PDF generated. Download to view: {os.path.basename(output_file_path if output_file_path else 'file.pdf')}]"
            # BUG FIX: the original searched for a log message inside the file
            # *path* string ("Saving as Markdown instead" in output_file_path),
            # which can never match. save_output_to_file signals the fallback
            # by saving with a ".pdf.md" suffix, so test for that instead.
            if output_file_path and output_file_path.endswith(".pdf.md"):
                preview_content = raw_content + f"\n\n[Note: PDF conversion failed, showing Markdown. File saved as .pdf.md]"
        progress(1, desc="Processing complete.")
        return f"Successfully processed: {url_or_id}", preview_content, output_file_path
    except Exception as e:
        return f"Error during file conversion/saving: {str(e)}", raw_content, None
# --- Gradio UI: inputs on the left, status/preview/download on the right ---
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )
    with gr.Row():
        with gr.Column(scale=2):
            # Input controls.
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="e.g., https://example.com OR username/repo"
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type"
            )
            # Depth applies to webpage scraping only; ignored for repos.
            depth_input = gr.Slider(
                minimum=0, maximum=3, step=1, value=0,
                label="Scraping Depth (for Webpages)",
                info="0: Only main page. Ignored for GitHub repos."
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format"
            )
            submit_button = gr.Button("Process Content", variant="primary")
        with gr.Column(scale=3):
            # Output widgets populated by process_input_updated.
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
            file_download_output = gr.File(label="Download Processed File", interactive=False)
    # Clickable example rows; caching disabled because runs hit the network.
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False
    )
    with gr.Accordion("How it Works & More Info", open=False):
        gr.Markdown(
            """
            **Webpage Scraping:**
            1. Enter a full URL (e.g., `https://example.com`).
            2. Select "Webpage" as the source type.
            3. Set the desired scraping depth.
            4. Choose your output format.
            **GitHub Repository Processing:**
            1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
            2. Select "GitHub Repository". (Depth is ignored).
            3. Choose your output format. Uses **RepoMix**.
            **Output Formats:** Markdown, JSON, CSV, Text, PDF.
            **Note:** PDF generation requires `markdown-pdf` library.
            This app is designed for Docker/HuggingFace Spaces.
            [View Source Code on HuggingFace Spaces](https://huggingface.co/spaces/CultriX/RAG-Scraper)
            """
        )
    # Wire the submit button to the main processing pipeline.
    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )
if __name__ == "__main__":
    iface.launch()