# untold/app.py — uploaded to Hugging Face Spaces via huggingface_hub (revision b0920a5)
import gradio as gr
import json
import time
import requests
from groq import Groq
import google.generativeai as genai
from datetime import datetime, timedelta
import pytz
from playwright.async_api import async_playwright
import asyncio
import random
from fake_useragent import UserAgent
from urllib.parse import urlparse, urljoin
from tenacity import retry, stop_after_attempt, wait_exponential
import os
import subprocess
# Install Playwright and browsers
# subprocess.run(["playwright", "install"], check=True)
# subprocess.run(["playwright", "install-deps"], check=True)
# os.system("apt-get update")
# Install the native shared libraries that Playwright's Chromium needs at
# runtime (Debian/Ubuntu package names, as used on Hugging Face Spaces).
# NOTE(review): exit status of os.system is ignored, so a failed install only
# surfaces later when the browser fails to launch — presumably acceptable here.
os.system(
"apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 "
"libxcomposite1 libxdamage1 libatspi2.0-0 libxrandr2 libgbm1 libpango-1.0-0 "
"libasound2 libxshmfence1 libwayland-server0 libwayland-client0 "
"libgdk-pixbuf2.0-0"
)
# Install Playwright browsers
# subprocess.run(["sudo","playwright", "install"], check=True)
# Install Playwright dependencies
# subprocess.run(["sudo","playwright", "install-deps"], check=True)
# print("Playwright and its dependencies have been installed successfully!")
# Constants — JSON files used to hand data between the pipeline stages
CHAPTERS_FILE = 'scraped_chapters.json'           # raw scraped chapters
SPLIT_CHAPTERS_FILE = 'split_scraped_chapters.json'  # after long-chapter splitting
TRANSLATIONS_FILE = 'chapter_translated.json'     # final English translations
GLOSSARY_FILE = 'chapter_glossary.json'           # LLM-built term glossary
# Shared fake-useragent generator used by the scraper browser contexts
ua = UserAgent()
# Function to scrape chapters from xbanxia
async def scrape_xbanxia(first_chapter_url, final_url=None):
    """Scrape sequential chapters from an xbanxia chapter chain.

    Starts at first_chapter_url and follows the '.nav2 .next a' link until
    there is no next link or final_url is reached (final_url itself is NOT
    scraped). Returns a list of {'title', 'content', 'url'} dicts, or None
    if the whole run fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            page = await fetch_page(page, first_chapter_url)
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0
            while next_url and (not final_url or next_url != final_url):
                # Remember the page being scraped this iteration; next_url is
                # overwritten below with the FOLLOWING chapter's link.
                current_url = next_url
                try:
                    # Rotate the user agent every 5 chapters to look less bot-like
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})
                    page = await fetch_page(page, current_url)
                    # Wait for content to load
                    await page.wait_for_selector('#nr_title', state='visible', timeout=60000)
                    await page.wait_for_selector('#nr1', state='visible', timeout=60000)
                    # Extract title
                    title_element = await page.query_selector('#nr_title')
                    title = await title_element.inner_text() if title_element else None
                    # Extract content
                    content_element = await page.query_selector('#nr1')
                    content = await content_element.inner_text() if content_element else None
                    # Extract next URL
                    next_link = await page.query_selector('.nav2 .next a')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if next_url and not next_url.startswith('http'):
                        # Resolve a relative link against the site root
                        base_url = '/'.join(first_chapter_url.split('/')[:3])
                        next_url = base_url + next_url
                    if title and content:
                        # Drop blank lines and chapter-heading lines ('第...')
                        content_lines = content.split('\n')
                        clean_content = '\n'.join(
                            line.strip() for line in content_lines
                            if line.strip() and not line.strip().startswith('第')
                        )
                        chapters.append({
                            'title': title.strip(),
                            'content': clean_content,
                            # BUGFIX: record the URL of the chapter just scraped;
                            # the old code stored the FOLLOWING chapter's URL.
                            'url': current_url
                        })
                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1
                    # Random delay between requests to avoid rate limiting
                    await asyncio.sleep(random.uniform(2, 5))
                except Exception as e:
                    # BUGFIX: log the page we were actually on, not a possibly
                    # already-updated next_url.
                    print(f"Error scraping chapter at {current_url}: {str(e)}")
                    await asyncio.sleep(60)  # Wait for 1 minute before retrying
            return chapters
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            return None
        finally:
            # Always release the browser, on success or failure
            await browser.close()
# Function to scrape chapters from 69shuba.cx
async def scrape_69shu(first_chapter_url, final_url=None):
    """Scrape sequential chapters from 69shuba.cx.

    Follows the 4th link inside '.page1' (the site's next-chapter control)
    until there is no next link or final_url is reached (final_url itself is
    NOT scraped). Returns a list of {'title', 'content', 'url'} dicts, or
    None if the whole run fails.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            # Navigate to the first chapter
            page = await fetch_page(page, first_chapter_url)
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0
            while next_url and (not final_url or next_url != final_url):
                # Remember the page being scraped; next_url is overwritten
                # below with the FOLLOWING chapter's link.
                current_url = next_url
                try:
                    # Change user agent every 5 chapters to look less bot-like
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})
                    page = await fetch_page(page, current_url)
                    await page.wait_for_selector('.txtnav', state='visible', timeout=60000)
                    # Extract title; fall back to the first line of .txtnav text
                    title_element = await page.query_selector('.txtnav h1')
                    title = await title_element.inner_text() if title_element else None
                    if not title:
                        title_element = await page.query_selector('.txtnav')
                        if title_element:
                            title_text = await title_element.inner_text()
                            title = title_text.split('\n')[0].strip()
                    # Extract content (NOTE(review): .txtnav appears to include
                    # the title line as well — confirm against the live site)
                    content_element = await page.query_selector('.txtnav')
                    content = await content_element.inner_text() if content_element else None
                    # Extract next URL
                    next_link = await page.query_selector('.page1 a:nth-child(4)')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if title and content:
                        # Drop blank lines and lines starting with 'Chapter'
                        content_lines = content.split('\n')
                        clean_content = '\n'.join(
                            line.strip() for line in content_lines
                            if line.strip() and not line.strip().startswith('Chapter')
                        )
                        chapters.append({
                            'title': title,
                            'content': clean_content,
                            # BUGFIX: record the scraped chapter's own URL;
                            # the old code stored the FOLLOWING chapter's URL.
                            'url': current_url
                        })
                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1
                    # Add a random delay between requests
                    await asyncio.sleep(random.uniform(2, 5))
                except Exception as e:
                    # BUGFIX: log the page we were actually on, not a possibly
                    # already-updated next_url.
                    print(f"Error scraping chapter at {current_url}: {str(e)}")
                    await asyncio.sleep(60)  # Wait for 1 minute before trying again
            return chapters
        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            return None
        finally:
            # Always release the browser, on success or failure
            await browser.close()
# Function to fetch a page
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_page(page, url):
    """Navigate *page* to *url* and wait until the network is idle.

    Retried up to 3 times with exponential backoff (4-10s) on any exception,
    via tenacity. Returns the same Page object for convenient chaining.
    """
    await page.goto(url)
    await page.wait_for_load_state('networkidle')
    return page
# Function to scrape chapters based on the domain
async def scrape_task(first_chapter_url, final_url=None):
    """Scrape chapters and save to JSON file.

    Dispatches to the scraper matching the URL's domain, then persists the
    chapter list to CHAPTERS_FILE. Prints status instead of raising.
    """
    domain = urlparse(first_chapter_url).netloc
    # Pick the site-specific scraper; bail out on unknown hosts.
    if 'xbanxia' in domain:
        chapters = await scrape_xbanxia(first_chapter_url, final_url)
    elif '69shuba.cx' in domain:
        chapters = await scrape_69shu(first_chapter_url, final_url)
    else:
        print(f"Unsupported domain: {domain}")
        return
    # A None/empty result means the scraper gave up part-way through.
    if not chapters:
        print('Scraping failed or was interrupted.')
        return
    with open(CHAPTERS_FILE, 'w', encoding='utf-8') as f:
        json.dump(chapters, f, ensure_ascii=False, indent=2)
    print(f'Scraping completed. Data saved to {CHAPTERS_FILE}')
# Function to split long chapters
def split_long_chapter(title, content, max_length=2000):
    """
    Split a long chapter into multiple parts while preserving paragraph and
    sentence integrity. Splits occur at newline (\\n) or sentence-ending
    symbol (。).

    Only CJK Unified Ideographs count toward *max_length*. A chapter at or
    under the budget is returned unchanged as a single-element list; a longer
    one is returned as parts titled '<title> Part N'.

    Returns a list of {"title": str, "content": str} dicts.
    """
    def _chinese_count(text):
        # Count only Chinese characters (CJK Unified Ideographs) for the budget.
        return sum(1 for char in text if '\u4e00' <= char <= '\u9fff')

    if _chinese_count(content) <= max_length:
        return [{"title": title, "content": content}]

    parts = []
    current_part = []
    current_chinese_count = 0
    part_number = 1

    # First split by paragraphs (newlines), skipping blank lines.
    for paragraph in content.split('\n'):
        if not paragraph.strip():
            continue
        # Split the paragraph into sentences on '。' and re-attach the
        # terminator. BUGFIX: the old code appended '。' unconditionally,
        # inventing a terminator for a trailing fragment that had none.
        raw = paragraph.split('。')
        sentences = [s.strip() + '。' for s in raw[:-1] if s.strip()]
        if raw[-1].strip():
            sentences.append(raw[-1].strip())

        for sentence in sentences:
            sentence_chinese_count = _chinese_count(sentence)
            # If adding this sentence would exceed the limit, close the part.
            # (A single oversized sentence still goes into one part whole —
            # we never split inside a sentence.)
            if current_chinese_count + sentence_chinese_count > max_length and current_part:
                parts.append({
                    "title": f"{title} Part {part_number}",
                    "content": '\n'.join(current_part)
                })
                # Start a new part with the sentence that did not fit.
                current_part = [sentence]
                current_chinese_count = sentence_chinese_count
                part_number += 1
            else:
                current_part.append(sentence)
                current_chinese_count += sentence_chinese_count

    # Save the last part if there's anything remaining.
    if current_part:
        parts.append({
            "title": f"{title} Part {part_number}",
            "content": '\n'.join(current_part)
        })
    return parts
# Function to process chapters
def process_chapters(input_file, output_file, max_length=5000):
    """
    Process chapters from an input JSON file, splitting long chapters if
    necessary, and save the result to an output JSON file.

    Returns the number of chapters after splitting. Re-raises any failure
    after printing it.
    """
    try:
        # Fail fast with a clear message when scraping hasn't run yet.
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' not found. Please ensure the scraping process runs first.")
        with open(input_file, 'r', encoding='utf-8') as f:
            chapters = json.load(f)
        # Flatten: each chapter expands to one or more split parts.
        processed_chapters = []
        for chapter in chapters:
            processed_chapters.extend(
                split_long_chapter(chapter['title'], chapter['content'], max_length)
            )
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(processed_chapters, f, ensure_ascii=False, indent=2)
        return len(processed_chapters)
    except Exception as e:
        print(f"Error processing chapters: {str(e)}")
        raise
def create_glossary(gemini_api_key, groq_api_key=None):
    """Create a glossary from random chapters using Groq or Gemini API.

    Samples up to 2 chapters from SPLIT_CHAPTERS_FILE, asks the model for 5
    key terms per chapter, then asks it to refine/deduplicate the combined
    list. The refined text (split on newlines) is written to GLOSSARY_FILE
    as a JSON list of strings. Raises if the refinement step fails.
    """
    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)  # book_data is a list of chapters

    # Configure Gemini once up front. BUGFIX: gemini_model used to be bound
    # only inside the per-chapter Gemini fallback branch, so the refinement
    # step below could raise NameError when Groq handled every chapter (or
    # when no chapter call succeeded).
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )

    # Sample up to 2 random chapters to keep API usage small.
    # (The old comment claimed 20 chapters from the first 100 — the code
    # has always sampled min(2, len(book_data)).)
    random_chapters = random.sample(book_data, min(2, len(book_data)))
    preliminary_glossary = []
    for chapter in random_chapters:
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            try:
                prompt = f"""Analyze the following Chinese web novel chapter and create a glossary of 5 important terms or names. Each entry should include the Chinese term and its English equivalent or explanation. Translate character names, locations names, unique concepts, cultivation levels, power levels, power techniques, or culturally specific terms to English.
The target audience are people from USA that don't know much about Chinese language and culture.
Very important Note: Only Use Pinyin for Character's Name.
Chinese chapter:
{chapter['content']}
Create a glossary of 5 terms in the following format:
Chinese Term: English Equivalent
for example: 朱士久 : Zhu Shijiu
"""
                if groq_api_key:
                    # Use Groq API if the key is provided
                    client = Groq(api_key=groq_api_key)
                    chat_completion = client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    chapter_glossary = chat_completion.choices[0].message.content
                else:
                    # Fallback to Gemini API if Groq key is not provided
                    gemini_response = gemini_model.generate_content(prompt)
                    chapter_glossary = gemini_response.text
                preliminary_glossary.extend(chapter_glossary.split('\n'))
                print(f"Created glossary entries for chapter: {chapter['title']}")
                break
            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    print(f"Error processing chapter {chapter['title']}: {str(e)}")
                    print(f"Retrying in 60 seconds... (Attempt {retry_count + 1} of {max_retries})")
                    time.sleep(60)
                else:
                    print(f"Failed to process chapter {chapter['title']} after {max_retries} attempts: {str(e)}")
        time.sleep(5)  # brief pause between chapters

    # Refine the glossary: deduplicate and normalize the collected entries.
    refine_prompt = """Refine the following glossary for a Chinese web novel. Remove duplicates, redundant entries, and irrelevant words. Ensure consistency in naming and explanations.
Provide the output in JSON Format.
Preliminary Glossary:
{}
Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.
Provide the refined glossary in the following format:
Chinese Characters: English Equivalent (No Explanations)
for example: 朱士久 : Zhu Shijiu
白家 : Bai Family
成长系统: Growth System ( not "Chengzhang Xitong)
""".format('\n'.join(preliminary_glossary))
    try:
        if groq_api_key:
            # Use Groq API for refinement if the key is provided
            client = Groq(api_key=groq_api_key)
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": refine_prompt}],
                model="llama3-70b-8192",
                timeout=60
            )
            refined_glossary = chat_completion.choices[0].message.content
        else:
            # Fallback to Gemini API for refinement
            gemini_response = gemini_model.generate_content(refine_prompt)
            refined_glossary = gemini_response.text
        # Save the refined glossary as a JSON list of lines.
        # NOTE(review): the prompt asks for JSON but the raw text is saved
        # line-by-line — downstream (translate_task) joins it back with '\n'.
        with open(GLOSSARY_FILE, 'w', encoding='utf-8') as f:
            json.dump(refined_glossary.split('\n'), f, ensure_ascii=False, indent=2)
        print(f'Glossary creation completed. Glossary saved to {GLOSSARY_FILE}')
    except Exception as e:
        print(f"Error refining glossary: {str(e)}")
        raise
# Function to translate chapters
def translate_task(gemini_api_key, groq_api_key):
    """Translate every chapter in SPLIT_CHAPTERS_FILE to English.

    Gemini is the primary translator (up to 2 attempts per chapter); Groq
    LLaMA is the fallback (up to 2 attempts) when a Groq key was supplied.
    Each result — or a 'TRANSLATION FAILED: <title>' placeholder — is
    written to TRANSLATIONS_FILE as {'title', 'translated_content'} entries.
    """
    # Configure Gemini
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )
    # Load chapters and the glossary produced by create_glossary()
    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)
    with open(GLOSSARY_FILE, 'r', encoding='utf-8') as f:
        glossary = json.load(f)
    formatted_glossary = "\n".join(glossary)
    # Configure Groq (optional fallback)
    groq_client = Groq(api_key=groq_api_key) if groq_api_key else None
    translations = []
    for i, chapter in enumerate(book_data):
        prompt = f"""Translate the following Chinese web novel chapter to English. Maintain the original tone and style of the novel. Preserve any cultural references or idioms, providing brief explanations in parentheses if necessary.
If Paragraphs are stuck together, split them. Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.
You should translate every chinese character to English. The chapter should be fully translated.
Glossary:
{formatted_glossary}
Chinese chapter:
{chapter['content']}
Note: No introductory sentences nor concluding sentences. Just directly provide the translation.
Translate the above text to English, using the glossary for consistent translations of key terms:"""
        translation = None
        # Try Gemini first. BUGFIX: the old code printed
        # "Falling back to Gemini..." here although Gemini is the primary path.
        for attempt in range(2):
            try:
                gemini_response = gemini_model.generate_content(prompt)
                translation = gemini_response.text
                break
            except Exception as e:
                print(f"Gemini error (attempt {attempt + 1}): {str(e)}")
                if attempt == 1:
                    print("Gemini failed. Falling back to Groq LLaMA model")
                else:
                    time.sleep(30)
        # If Gemini failed, try Groq
        if not translation and groq_client:
            for attempt in range(2):
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    translation = chat_completion.choices[0].message.content
                    break
                except Exception as e:
                    print(f"Groq error (attempt {attempt + 1}): {str(e)}")
                    if attempt < 1:
                        time.sleep(30)
        if not translation:
            # BUGFIX: when no Groq client existed, a Gemini failure used to
            # leave translation as None and crash on translation[:500] below.
            print(f"Failed to translate Chapter {i + 1} after all attempts")
            translation = f"TRANSLATION FAILED: {chapter['title']}"
        # Add the complete chapter translation to results
        translations.append({
            'title': chapter['title'],
            'translated_content': translation
        })
        print(f"Completed translation of Chapter {i + 1}")
        print("First 500 characters of translation:")
        print(translation[:500] + "...")
        print('=======================================')
        time.sleep(5)  # Sleep between requests
    # Save all translations
    with open(TRANSLATIONS_FILE, 'w', encoding='utf-8') as f:
        json.dump(translations, f, ensure_ascii=False, indent=2)
    print(f'Translation completed. Translations saved to {TRANSLATIONS_FILE}')
# Gradio Interface
def process_novel(first_chapter_url, final_url, novel_name, gemini_api_key, groq_api_key):
    """Run the full pipeline: scrape -> split -> glossary -> translate.

    Returns a status string for the Gradio UI. NOTE(review): novel_name is
    currently unused — all intermediate files use module-level constant names.
    """
    # Scrape chapters
    asyncio.run(scrape_task(first_chapter_url, final_url))
    # Process chapters (split long chapters)
    process_chapters(CHAPTERS_FILE, SPLIT_CHAPTERS_FILE)
    # Build a glossary of key terms for consistent translation
    create_glossary(gemini_api_key, groq_api_key)
    # Translate chapters
    translate_task(gemini_api_key, groq_api_key)
    return "Scraping, Processing, and Translation Completed!"
# Gradio Interface — five text inputs mapped positionally onto
# process_novel(first_chapter_url, final_url, novel_name, gemini_api_key, groq_api_key)
iface = gr.Interface(
    fn=process_novel,
    inputs=[
        gr.Textbox(label="First Chapter URL"),
        gr.Textbox(label="Final Chapter URL (optional)"),
        gr.Textbox(label="Novel Name"),
        gr.Textbox(label="Gemini API Key"),
        gr.Textbox(label="Groq API Key (optional)"),
    ],
    outputs="text",
    title="Novel Scraper and Translator",
    description="Input the first chapter URL, final chapter URL (optional), novel name, and API keys to scrape and translate the novel."
)
# Launches the web UI; each submission runs the whole pipeline synchronously.
iface.launch()