# untold/app.py — uploaded to Hugging Face Spaces via huggingface_hub (revision b0920a5)
import gradio as gr
import json
import time
import requests
from groq import Groq
import google.generativeai as genai
from datetime import datetime, timedelta
import pytz
from playwright.async_api import async_playwright
import asyncio
import random
from fake_useragent import UserAgent
from urllib.parse import urlparse, urljoin
from tenacity import retry, stop_after_attempt, wait_exponential
import os
import subprocess
# Install Playwright and browsers
# subprocess.run(["playwright", "install"], check=True)
# subprocess.run(["playwright", "install-deps"], check=True)
# os.system("apt-get update")
# Install the native shared libraries that Playwright's Chromium needs at
# runtime (Debian/Ubuntu package names, as used on Hugging Face Spaces).
# NOTE(review): exit status of os.system is ignored, so a failed install only
# surfaces later when the browser fails to launch — presumably acceptable here.
os.system(
"apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 "
"libxcomposite1 libxdamage1 libatspi2.0-0 libxrandr2 libgbm1 libpango-1.0-0 "
"libasound2 libxshmfence1 libwayland-server0 libwayland-client0 "
"libgdk-pixbuf2.0-0"
)
# Install Playwright browsers
# subprocess.run(["sudo","playwright", "install"], check=True)
# Install Playwright dependencies
# subprocess.run(["sudo","playwright", "install-deps"], check=True)
# print("Playwright and its dependencies have been installed successfully!")
# Constants — JSON files used to hand data between the pipeline stages
CHAPTERS_FILE = 'scraped_chapters.json'           # raw scraped chapters
SPLIT_CHAPTERS_FILE = 'split_scraped_chapters.json'  # after long-chapter splitting
TRANSLATIONS_FILE = 'chapter_translated.json'     # final English translations
GLOSSARY_FILE = 'chapter_glossary.json'           # LLM-built term glossary
# Shared fake-useragent generator used by the scraper browser contexts
ua = UserAgent()
# Function to scrape chapters from xbanxia
async def scrape_xbanxia(first_chapter_url, final_url=None):
    """Scrape sequential chapters from an xbanxia chapter chain.

    Starts at first_chapter_url and follows the '.nav2 .next a' link until
    there is no next link or final_url is reached (final_url itself is NOT
    scraped). Returns a list of {'title', 'content', 'url'} dicts, or None
    if the whole run fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            page = await fetch_page(page, first_chapter_url)
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0
            while next_url and (not final_url or next_url != final_url):
                # Remember the page being scraped this iteration; next_url is
                # overwritten below with the FOLLOWING chapter's link.
                current_url = next_url
                try:
                    # Rotate the user agent every 5 chapters to look less bot-like
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})
                    page = await fetch_page(page, current_url)
                    # Wait for content to load
                    await page.wait_for_selector('#nr_title', state='visible', timeout=60000)
                    await page.wait_for_selector('#nr1', state='visible', timeout=60000)
                    # Extract title
                    title_element = await page.query_selector('#nr_title')
                    title = await title_element.inner_text() if title_element else None
                    # Extract content
                    content_element = await page.query_selector('#nr1')
                    content = await content_element.inner_text() if content_element else None
                    # Extract next URL
                    next_link = await page.query_selector('.nav2 .next a')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if next_url and not next_url.startswith('http'):
                        # Resolve a relative link against the site root
                        base_url = '/'.join(first_chapter_url.split('/')[:3])
                        next_url = base_url + next_url
                    if title and content:
                        # Drop blank lines and chapter-heading lines ('第...')
                        content_lines = content.split('\n')
                        clean_content = '\n'.join(
                            line.strip() for line in content_lines
                            if line.strip() and not line.strip().startswith('第')
                        )
                        chapters.append({
                            'title': title.strip(),
                            'content': clean_content,
                            # BUGFIX: record the URL of the chapter just scraped;
                            # the old code stored the FOLLOWING chapter's URL.
                            'url': current_url
                        })
                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1
                    # Random delay between requests to avoid rate limiting
                    await asyncio.sleep(random.uniform(2, 5))
                except Exception as e:
                    # BUGFIX: log the page we were actually on, not a possibly
                    # already-updated next_url.
                    print(f"Error scraping chapter at {current_url}: {str(e)}")
                    await asyncio.sleep(60)  # Wait for 1 minute before retrying
            return chapters
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            return None
        finally:
            # Always release the browser, on success or failure
            await browser.close()
# Function to scrape chapters from 69shuba.cx
async def scrape_69shu(first_chapter_url, final_url=None):
    """Scrape sequential chapters from 69shuba.cx.

    Follows the 4th link inside '.page1' (the site's next-chapter control)
    until there is no next link or final_url is reached (final_url itself is
    NOT scraped). Returns a list of {'title', 'content', 'url'} dicts, or
    None if the whole run fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            # Navigate to the first chapter
            page = await fetch_page(page, first_chapter_url)
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0
            while next_url and (not final_url or next_url != final_url):
                # Remember the page being scraped; next_url is overwritten
                # below with the FOLLOWING chapter's link.
                current_url = next_url
                try:
                    # Change user agent every 5 chapters to look less bot-like
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})
                    page = await fetch_page(page, current_url)
                    await page.wait_for_selector('.txtnav', state='visible', timeout=60000)
                    # Extract title; fall back to the first line of .txtnav text
                    title_element = await page.query_selector('.txtnav h1')
                    title = await title_element.inner_text() if title_element else None
                    if not title:
                        title_element = await page.query_selector('.txtnav')
                        if title_element:
                            title_text = await title_element.inner_text()
                            title = title_text.split('\n')[0].strip()
                    # Extract content (NOTE(review): .txtnav appears to include
                    # the title line as well — confirm against the live site)
                    content_element = await page.query_selector('.txtnav')
                    content = await content_element.inner_text() if content_element else None
                    # Extract next URL
                    next_link = await page.query_selector('.page1 a:nth-child(4)')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if title and content:
                        # Drop blank lines and lines starting with 'Chapter'
                        content_lines = content.split('\n')
                        clean_content = '\n'.join(
                            line.strip() for line in content_lines
                            if line.strip() and not line.strip().startswith('Chapter')
                        )
                        chapters.append({
                            'title': title,
                            'content': clean_content,
                            # BUGFIX: record the scraped chapter's own URL;
                            # the old code stored the FOLLOWING chapter's URL.
                            'url': current_url
                        })
                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1
                    # Add a random delay between requests
                    await asyncio.sleep(random.uniform(2, 5))
                except Exception as e:
                    # BUGFIX: log the page we were actually on, not a possibly
                    # already-updated next_url.
                    print(f"Error scraping chapter at {current_url}: {str(e)}")
                    await asyncio.sleep(60)  # Wait for 1 minute before trying again
            return chapters
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            return None
        finally:
            # Always release the browser, on success or failure
            await browser.close()
# Function to fetch a page
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_page(page, url):
    """Navigate *page* to *url* and wait until the network is idle.

    Retried up to 3 times with exponential backoff (4-10s) on any exception,
    via tenacity. Returns the same Page object for convenient chaining.
    """
    await page.goto(url)
    await page.wait_for_load_state('networkidle')
    return page
# Function to scrape chapters based on the domain
async def scrape_task(first_chapter_url, final_url=None):
    """Scrape chapters and save to JSON file.

    Dispatches to the scraper matching the URL's domain, then persists the
    chapter list to CHAPTERS_FILE. Prints status instead of raising.
    """
    domain = urlparse(first_chapter_url).netloc
    # Pick the site-specific scraper; bail out on unknown hosts.
    if 'xbanxia' in domain:
        chapters = await scrape_xbanxia(first_chapter_url, final_url)
    elif '69shuba.cx' in domain:
        chapters = await scrape_69shu(first_chapter_url, final_url)
    else:
        print(f"Unsupported domain: {domain}")
        return
    # A None/empty result means the scraper gave up part-way through.
    if not chapters:
        print('Scraping failed or was interrupted.')
        return
    with open(CHAPTERS_FILE, 'w', encoding='utf-8') as f:
        json.dump(chapters, f, ensure_ascii=False, indent=2)
    print(f'Scraping completed. Data saved to {CHAPTERS_FILE}')
# Function to split long chapters
def split_long_chapter(title, content, max_length=2000):
    """
    Split a long chapter into multiple parts while preserving paragraph and
    sentence integrity. Splits occur at newline (\\n) or sentence-ending
    symbol (。).

    Only CJK Unified Ideographs count toward *max_length*. A chapter at or
    under the budget is returned unchanged as a single-element list; a longer
    one is returned as parts titled '<title> Part N'.

    Returns a list of {"title": str, "content": str} dicts.
    """
    def _chinese_count(text):
        # Count only Chinese characters (CJK Unified Ideographs) for the budget.
        return sum(1 for char in text if '\u4e00' <= char <= '\u9fff')

    if _chinese_count(content) <= max_length:
        return [{"title": title, "content": content}]

    parts = []
    current_part = []
    current_chinese_count = 0
    part_number = 1

    # First split by paragraphs (newlines), skipping blank lines.
    for paragraph in content.split('\n'):
        if not paragraph.strip():
            continue
        # Split the paragraph into sentences on '。' and re-attach the
        # terminator. BUGFIX: the old code appended '。' unconditionally,
        # inventing a terminator for a trailing fragment that had none.
        raw = paragraph.split('。')
        sentences = [s.strip() + '。' for s in raw[:-1] if s.strip()]
        if raw[-1].strip():
            sentences.append(raw[-1].strip())

        for sentence in sentences:
            sentence_chinese_count = _chinese_count(sentence)
            # If adding this sentence would exceed the limit, close the part.
            # (A single oversized sentence still goes into one part whole —
            # we never split inside a sentence.)
            if current_chinese_count + sentence_chinese_count > max_length and current_part:
                parts.append({
                    "title": f"{title} Part {part_number}",
                    "content": '\n'.join(current_part)
                })
                # Start a new part with the sentence that did not fit.
                current_part = [sentence]
                current_chinese_count = sentence_chinese_count
                part_number += 1
            else:
                current_part.append(sentence)
                current_chinese_count += sentence_chinese_count

    # Save the last part if there's anything remaining.
    if current_part:
        parts.append({
            "title": f"{title} Part {part_number}",
            "content": '\n'.join(current_part)
        })
    return parts
# Function to process chapters
def process_chapters(input_file, output_file, max_length=5000):
    """
    Process chapters from an input JSON file, splitting long chapters if
    necessary, and save the result to an output JSON file.

    Returns the number of chapters after splitting. Re-raises any failure
    after printing it.
    """
    try:
        # Fail fast with a clear message when scraping hasn't run yet.
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' not found. Please ensure the scraping process runs first.")
        with open(input_file, 'r', encoding='utf-8') as f:
            chapters = json.load(f)
        # Flatten: each chapter expands to one or more split parts.
        processed_chapters = []
        for chapter in chapters:
            processed_chapters.extend(
                split_long_chapter(chapter['title'], chapter['content'], max_length)
            )
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(processed_chapters, f, ensure_ascii=False, indent=2)
        return len(processed_chapters)
    except Exception as e:
        print(f"Error processing chapters: {str(e)}")
        raise
def create_glossary(gemini_api_key, groq_api_key=None):
    """Create a glossary from random chapters using Groq or Gemini API.

    Samples up to 2 chapters from SPLIT_CHAPTERS_FILE, asks the model for 5
    key terms per chapter, then asks it to refine/deduplicate the combined
    list. The refined text (split on newlines) is written to GLOSSARY_FILE
    as a JSON list of strings. Raises if the refinement step fails.
    """
    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)  # book_data is a list of chapters

    # Configure Gemini once up front. BUGFIX: gemini_model used to be bound
    # only inside the per-chapter Gemini fallback branch, so the refinement
    # step below could raise NameError when Groq handled every chapter (or
    # when no chapter call succeeded).
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )

    # Sample up to 2 random chapters to keep API usage small.
    # (The old comment claimed 20 chapters from the first 100 — the code
    # has always sampled min(2, len(book_data)).)
    random_chapters = random.sample(book_data, min(2, len(book_data)))
    preliminary_glossary = []
    for chapter in random_chapters:
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                prompt = f"""Analyze the following Chinese web novel chapter and create a glossary of 5 important terms or names. Each entry should include the Chinese term and its English equivalent or explanation. Translate character names, locations names, unique concepts, cultivation levels, power levels, power techniques, or culturally specific terms to English.
The target audience are people from USA that don't know much about Chinese language and culture.
Very important Note: Only Use Pinyin for Character's Name.
Chinese chapter:
{chapter['content']}
Create a glossary of 5 terms in the following format:
Chinese Term: English Equivalent
for example: 朱士久 : Zhu Shijiu
"""
                if groq_api_key:
                    # Use Groq API if the key is provided
                    client = Groq(api_key=groq_api_key)
                    chat_completion = client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    chapter_glossary = chat_completion.choices[0].message.content
                else:
                    # Fallback to Gemini API if Groq key is not provided
                    gemini_response = gemini_model.generate_content(prompt)
                    chapter_glossary = gemini_response.text
                preliminary_glossary.extend(chapter_glossary.split('\n'))
                print(f"Created glossary entries for chapter: {chapter['title']}")
                break
            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    print(f"Error processing chapter {chapter['title']}: {str(e)}")
                    print(f"Retrying in 60 seconds... (Attempt {retry_count + 1} of {max_retries})")
                    time.sleep(60)
                else:
                    print(f"Failed to process chapter {chapter['title']} after {max_retries} attempts: {str(e)}")
        time.sleep(5)  # brief pause between chapters

    # Refine the glossary: deduplicate and normalize the collected entries.
    refine_prompt = """Refine the following glossary for a Chinese web novel. Remove duplicates, redundant entries, and irrelevant words. Ensure consistency in naming and explanations.
Provide the output in JSON Format.
Preliminary Glossary:
{}
Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.
Provide the refined glossary in the following format:
Chinese Characters: English Equivalent (No Explanations)
for example: 朱士久 : Zhu Shijiu
白家 : Bai Family
成长系统: Growth System ( not "Chengzhang Xitong)
""".format('\n'.join(preliminary_glossary))
    try:
        if groq_api_key:
            # Use Groq API for refinement if the key is provided
            client = Groq(api_key=groq_api_key)
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": refine_prompt}],
                model="llama3-70b-8192",
                timeout=60
            )
            refined_glossary = chat_completion.choices[0].message.content
        else:
            # Fallback to Gemini API for refinement
            gemini_response = gemini_model.generate_content(refine_prompt)
            refined_glossary = gemini_response.text
        # Save the refined glossary as a JSON list of lines.
        # NOTE(review): the prompt asks for JSON but the raw text is saved
        # line-by-line — downstream (translate_task) joins it back with '\n'.
        with open(GLOSSARY_FILE, 'w', encoding='utf-8') as f:
            json.dump(refined_glossary.split('\n'), f, ensure_ascii=False, indent=2)
        print(f'Glossary creation completed. Glossary saved to {GLOSSARY_FILE}')
    except Exception as e:
        print(f"Error refining glossary: {str(e)}")
        raise
# Function to translate chapters
def translate_task(gemini_api_key, groq_api_key):
    """Translate every chapter in SPLIT_CHAPTERS_FILE to English.

    Gemini is the primary translator (up to 2 attempts per chapter); Groq
    LLaMA is the fallback (up to 2 attempts) when a Groq key was supplied.
    Each result — or a 'TRANSLATION FAILED: <title>' placeholder — is
    written to TRANSLATIONS_FILE as {'title', 'translated_content'} entries.
    """
    # Configure Gemini
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )
    # Load chapters and the glossary produced by create_glossary()
    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)
    with open(GLOSSARY_FILE, 'r', encoding='utf-8') as f:
        glossary = json.load(f)
    formatted_glossary = "\n".join(glossary)
    # Configure Groq (optional fallback)
    groq_client = Groq(api_key=groq_api_key) if groq_api_key else None
    translations = []
    for i, chapter in enumerate(book_data):
        prompt = f"""Translate the following Chinese web novel chapter to English. Maintain the original tone and style of the novel. Preserve any cultural references or idioms, providing brief explanations in parentheses if necessary.
If Paragraphs are stuck together, split them. Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.
You should translate every chinese character to English. The chapter should be fully translated.
Glossary:
{formatted_glossary}
Chinese chapter:
{chapter['content']}
Note: No introductory sentences nor concluding sentences. Just directly provide the translation.
Translate the above text to English, using the glossary for consistent translations of key terms:"""
        translation = None
        # Try Gemini first. BUGFIX: the old code printed
        # "Falling back to Gemini..." here although Gemini is the primary path.
        for attempt in range(2):
            try:
                gemini_response = gemini_model.generate_content(prompt)
                translation = gemini_response.text
                break
            except Exception as e:
                print(f"Gemini error (attempt {attempt + 1}): {str(e)}")
                if attempt == 1:
                    print("Gemini failed. Falling back to Groq LLaMA model")
                else:
                    time.sleep(30)
        # If Gemini failed, try Groq
        if not translation and groq_client:
            for attempt in range(2):
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    translation = chat_completion.choices[0].message.content
                    break
                except Exception as e:
                    print(f"Groq error (attempt {attempt + 1}): {str(e)}")
                    if attempt < 1:
                        time.sleep(30)
        if not translation:
            # BUGFIX: when no Groq client existed, a Gemini failure used to
            # leave translation as None and crash on translation[:500] below.
            print(f"Failed to translate Chapter {i + 1} after all attempts")
            translation = f"TRANSLATION FAILED: {chapter['title']}"
        # Add the complete chapter translation to results
        translations.append({
            'title': chapter['title'],
            'translated_content': translation
        })
        print(f"Completed translation of Chapter {i + 1}")
        print("First 500 characters of translation:")
        print(translation[:500] + "...")
        print('=======================================')
        time.sleep(5)  # Sleep between requests
    # Save all translations
    with open(TRANSLATIONS_FILE, 'w', encoding='utf-8') as f:
        json.dump(translations, f, ensure_ascii=False, indent=2)
    print(f'Translation completed. Translations saved to {TRANSLATIONS_FILE}')
# Gradio Interface
def process_novel(first_chapter_url, final_url, novel_name, gemini_api_key, groq_api_key):
    """Run the full pipeline: scrape -> split -> glossary -> translate.

    Returns a status string for the Gradio UI. NOTE(review): novel_name is
    currently unused — all intermediate files use module-level constant names.
    """
    # Scrape chapters
    asyncio.run(scrape_task(first_chapter_url, final_url))
    # Process chapters (split long chapters)
    process_chapters(CHAPTERS_FILE, SPLIT_CHAPTERS_FILE)
    # Build a glossary of key terms for consistent translation
    create_glossary(gemini_api_key, groq_api_key)
    # Translate chapters
    translate_task(gemini_api_key, groq_api_key)
    return "Scraping, Processing, and Translation Completed!"
# Gradio Interface — five text inputs mapped positionally onto
# process_novel(first_chapter_url, final_url, novel_name, gemini_api_key, groq_api_key)
iface = gr.Interface(
    fn=process_novel,
    inputs=[
        gr.Textbox(label="First Chapter URL"),
        gr.Textbox(label="Final Chapter URL (optional)"),
        gr.Textbox(label="Novel Name"),
        gr.Textbox(label="Gemini API Key"),
        gr.Textbox(label="Groq API Key (optional)"),
    ],
    outputs="text",
    title="Novel Scraper and Translator",
    description="Input the first chapter URL, final chapter URL (optional), novel name, and API keys to scrape and translate the novel."
)
# Launches the web UI; each submission runs the whole pipeline synchronously.
iface.launch()