import asyncio
import json
import os
import random
import time
from urllib.parse import urljoin, urlparse

import google.generativeai as genai
import gradio as gr
from fake_useragent import UserAgent
from groq import Groq
from playwright.async_api import async_playwright
from tenacity import retry, stop_after_attempt, wait_exponential

# Install the system libraries Chromium needs so Playwright can run headless
# (required in slim Debian/Ubuntu containers).
os.system(
    "apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 "
    "libxcomposite1 libxdamage1 libatspi2.0-0 libxrandr2 libgbm1 libpango-1.0-0 "
    "libasound2 libxshmfence1 libwayland-server0 libwayland-client0 "
    "libgdk-pixbuf2.0-0"
)
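
# Note: if the Playwright CLI is on PATH, `playwright install --with-deps chromium`
# would install the browser plus these system libraries in one step.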

# Intermediate artifacts written between pipeline stages.
CHAPTERS_FILE = 'scraped_chapters.json'
SPLIT_CHAPTERS_FILE = 'split_scraped_chapters.json'
TRANSLATIONS_FILE = 'chapter_translated.json'
GLOSSARY_FILE = 'chapter_glossary.json'
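
# Pipeline: scrape_task -> CHAPTERS_FILE, process_chapters -> SPLIT_CHAPTERS_FILE,
# create_glossary -> GLOSSARY_FILE, translate_task -> TRANSLATIONS_FILE.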

# Random User-Agent strings, rotated during scraping to reduce the chance of
# being blocked.
ua = UserAgent()


async def scrape_xbanxia(first_chapter_url, final_url=None):
    """Scrape xbanxia chapters by following each page's "next" link."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0

            # Walk the chapter chain; stop when there is no next link or when
            # final_url is reached (final_url itself is not scraped).
            while next_url and (not final_url or next_url != final_url):
                try:
                    # Rotate the User-Agent every five chapters.
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})

                    current_url = next_url
                    page = await fetch_page(page, current_url)

                    # Wait until the title and body containers are rendered.
                    await page.wait_for_selector('#nr_title', state='visible', timeout=60000)
                    await page.wait_for_selector('#nr1', state='visible', timeout=60000)

                    title_element = await page.query_selector('#nr_title')
                    title = await title_element.inner_text() if title_element else None

                    content_element = await page.query_selector('#nr1')
                    content = await content_element.inner_text() if content_element else None

                    # Resolve the next chapter link; hrefs may be relative.
                    next_link = await page.query_selector('.nav2 .next a')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if next_url:
                        next_url = urljoin(current_url, next_url)

                    if title and content:
                        # Drop blank lines and repeated in-body chapter
                        # headings (lines starting with '第').
                        clean_content = '\n'.join(
                            line.strip() for line in content.split('\n')
                            if line.strip() and not line.strip().startswith('第')
                        )

                        chapters.append({
                            'title': title.strip(),
                            'content': clean_content,
                            'url': current_url,
                        })

                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1

                    # Polite, randomized delay between requests.
                    await asyncio.sleep(random.uniform(2, 5))

                except Exception as e:
                    print(f"Error scraping chapter at {next_url}: {str(e)}")
                    await asyncio.sleep(60)  # back off, then retry the same URL

            await browser.close()
            return chapters

        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            await browser.close()
            return None
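

# scrape_69shu mirrors the loop above with selectors for 69shuba's page layout:
# '.txtnav' holds both the chapter heading and the body text.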
async def scrape_69shu(first_chapter_url, final_url=None):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent=ua.random)
        page = await context.new_page()
        try:
            chapters = []
            next_url = first_chapter_url
            chapter_count = 0

            while next_url and (not final_url or next_url != final_url):
                try:
                    if chapter_count % 5 == 0:
                        await context.set_extra_http_headers({"User-Agent": ua.random})

                    current_url = next_url
                    page = await fetch_page(page, current_url)
                    await page.wait_for_selector('.txtnav', state='visible', timeout=60000)

                    title_element = await page.query_selector('.txtnav h1')
                    title = await title_element.inner_text() if title_element else None

                    # Fall back to the first line of the text block when the
                    # page has no <h1> inside .txtnav.
                    if not title:
                        title_element = await page.query_selector('.txtnav')
                        if title_element:
                            title_text = await title_element.inner_text()
                            title = title_text.split('\n')[0].strip()

                    content_element = await page.query_selector('.txtnav')
                    content = await content_element.inner_text() if content_element else None

                    # The fourth link in the pager is the "next chapter" link.
                    next_link = await page.query_selector('.page1 a:nth-child(4)')
                    next_url = await next_link.get_attribute('href') if next_link else None
                    if next_url:
                        next_url = urljoin(current_url, next_url)

                    if title and content:
                        # Drop blank lines and any 'Chapter ...' heading lines.
                        clean_content = '\n'.join(
                            line.strip() for line in content.split('\n')
                            if line.strip() and not line.strip().startswith('Chapter')
                        )

                        chapters.append({
                            'title': title,
                            'content': clean_content,
                            'url': current_url,
                        })

                        print(f"Scraped chapter {chapter_count + 1}: {title}")
                        chapter_count += 1

                    await asyncio.sleep(random.uniform(2, 5))

                except Exception as e:
                    print(f"Error scraping chapter at {next_url}: {str(e)}")
                    await asyncio.sleep(60)

            await browser.close()
            return chapters

        except Exception as e:
            print(f"An error occurred during scraping: {str(e)}")
            await browser.close()
            return None


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_page(page, url):
    """Navigate to url and wait for network idle, retrying transient failures."""
    await page.goto(url)
    await page.wait_for_load_state('networkidle')
    return page
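
# Note: tenacity's @retry supports async functions; waits between attempts grow
# exponentially, clamped to the 4-10 second range, and after the third failed
# attempt a tenacity.RetryError wrapping the last exception propagates.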


async def scrape_task(first_chapter_url, final_url=None):
    """Scrape chapters and save to JSON file."""
    domain = urlparse(first_chapter_url).netloc
    if 'xbanxia' in domain:
        result = await scrape_xbanxia(first_chapter_url, final_url)
    elif '69shuba.cx' in domain:
        result = await scrape_69shu(first_chapter_url, final_url)
    else:
        print(f"Unsupported domain: {domain}")
        return
    if result:
        with open(CHAPTERS_FILE, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f'Scraping completed. Data saved to {CHAPTERS_FILE}')
    else:
        print('Scraping failed or was interrupted.')
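
# Example (illustrative URL; only xbanxia and 69shuba.cx domains are handled):
#   asyncio.run(scrape_task('https://www.69shuba.cx/txt/.../1.html'))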


def split_long_chapter(title, content, max_length=2000):
    """
    Split a long chapter into multiple parts while preserving paragraph and
    sentence integrity. Splits occur at newlines or at the sentence-ending
    mark '。'. Length is measured in Chinese characters only.
    """
    # Count CJK characters; Latin text and punctuation are not counted.
    chinese_char_count = sum(1 for char in content if '\u4e00' <= char <= '\u9fff')

    if chinese_char_count <= max_length:
        return [{"title": title, "content": content}]

    parts = []
    current_part = []
    current_chinese_count = 0
    part_number = 1

    paragraphs = content.split('\n')

    for paragraph in paragraphs:
        if not paragraph.strip():
            continue

        # Break the paragraph into sentences at '。' (the mark is re-appended
        # to each fragment).
        sentences = paragraph.split('。')
        sentences = [s.strip() + '。' for s in sentences if s.strip()]

        for sentence in sentences:
            sentence_chinese_count = sum(1 for char in sentence if '\u4e00' <= char <= '\u9fff')

            # Start a new part when adding this sentence would exceed the cap.
            if current_chinese_count + sentence_chinese_count > max_length and current_part:
                parts.append({
                    "title": f"{title} Part {part_number}",
                    "content": '\n'.join(current_part)
                })
                current_part = [sentence]
                current_chinese_count = sentence_chinese_count
                part_number += 1
            else:
                current_part.append(sentence)
                current_chinese_count += sentence_chinese_count

    # Flush whatever remains into a final part.
    if current_part:
        parts.append({
            "title": f"{title} Part {part_number}",
            "content": '\n'.join(current_part)
        })

    return parts
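
# Example: a chapter of roughly 4,500 Chinese characters with max_length=2000
# comes back as three entries titled "<title> Part 1" through "<title> Part 3".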


def process_chapters(input_file, output_file, max_length=5000):
    """
    Process chapters from an input JSON file, splitting long chapters if
    necessary, and save the result to an output JSON file. Returns the number
    of chapters written.
    """
    try:
        if not os.path.exists(input_file):
            raise FileNotFoundError(f"Input file '{input_file}' not found. Please ensure the scraping process runs first.")

        with open(input_file, 'r', encoding='utf-8') as f:
            chapters = json.load(f)

        processed_chapters = []
        for chapter in chapters:
            split_chapters = split_long_chapter(chapter['title'], chapter['content'], max_length)
            processed_chapters.extend(split_chapters)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(processed_chapters, f, ensure_ascii=False, indent=2)

        return len(processed_chapters)
    except Exception as e:
        print(f"Error processing chapters: {str(e)}")
        raise
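
# Note: the default max_length here (5000) overrides split_long_chapter's own
# default of 2000, so chapters are only split past ~5000 Chinese characters.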


def create_glossary(gemini_api_key, groq_api_key=None):
    """Create a glossary from two random chapters using the Groq or Gemini API."""
    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)

    # Sample up to two chapters to keep API usage small.
    random_chapters = random.sample(book_data, min(2, len(book_data)))

    # Configure both clients once; the refinement step below also needs them.
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )
    groq_client = Groq(api_key=groq_api_key) if groq_api_key else None

    preliminary_glossary = []

    for chapter in random_chapters:
        max_retries = 3
        retry_count = 0

        while retry_count < max_retries:
            try:
                prompt = f"""Analyze the following Chinese web novel chapter and create a glossary of 5 important terms or names. Each entry should include the Chinese term and its English equivalent or explanation. Translate character names, location names, unique concepts, cultivation levels, power levels, power techniques, and culturally specific terms into English.
The target audience is US readers who are not familiar with the Chinese language or culture.
Very important note: use Pinyin only for characters' names.

Chinese chapter:
{chapter['content']}

Create a glossary of 5 terms in the following format:
Chinese Term: English Equivalent
for example: 朱士久 : Zhu Shijiu
"""

                if groq_client:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    chapter_glossary = chat_completion.choices[0].message.content
                else:
                    gemini_response = gemini_model.generate_content(prompt)
                    chapter_glossary = gemini_response.text

                preliminary_glossary.extend(chapter_glossary.split('\n'))
                print(f"Created glossary entries for chapter: {chapter['title']}")
                break
            except Exception as e:
                retry_count += 1
                if retry_count < max_retries:
                    print(f"Error processing chapter {chapter['title']}: {str(e)}")
                    print(f"Retrying in 60 seconds... (Attempt {retry_count + 1} of {max_retries})")
                    time.sleep(60)
                else:
                    print(f"Failed to process chapter {chapter['title']} after {max_retries} attempts: {str(e)}")

        time.sleep(5)  # small pause between chapters to respect rate limits

    # Second pass: ask the model to deduplicate and normalize the raw entries.
    refine_prompt = """Refine the following glossary for a Chinese web novel. Remove duplicates, redundant entries, and irrelevant words. Ensure consistency in naming and explanations.

Preliminary Glossary:
{}

Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.
Provide the refined glossary in the following format:
Chinese Characters: English Equivalent (No Explanations)
for example: 朱士久 : Zhu Shijiu
白家 : Bai Family
成长系统 : Growth System (not "Chengzhang Xitong")
""".format('\n'.join(preliminary_glossary))

    try:
        if groq_client:
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": refine_prompt}],
                model="llama3-70b-8192",
                timeout=60
            )
            refined_glossary = chat_completion.choices[0].message.content
        else:
            gemini_response = gemini_model.generate_content(refine_prompt)
            refined_glossary = gemini_response.text

        # Store one glossary line per list element.
        with open(GLOSSARY_FILE, 'w', encoding='utf-8') as f:
            json.dump(refined_glossary.split('\n'), f, ensure_ascii=False, indent=2)
        print(f'Glossary creation completed. Glossary saved to {GLOSSARY_FILE}')

    except Exception as e:
        print(f"Error refining glossary: {str(e)}")
        raise
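
# GLOSSARY_FILE ends up as a JSON list of strings, one "中文 : English" entry
# per line of the model's refined output.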


def translate_task(gemini_api_key, groq_api_key):
    """Translate every split chapter to English, preferring Gemini with a Groq fallback."""
    genai.configure(api_key=gemini_api_key)
    gemini_model = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config={
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 64,
            "max_output_tokens": 8192,
        }
    )

    with open(SPLIT_CHAPTERS_FILE, 'r', encoding='utf-8') as f:
        book_data = json.load(f)
    with open(GLOSSARY_FILE, 'r', encoding='utf-8') as f:
        glossary = json.load(f)
    formatted_glossary = "\n".join(glossary)

    groq_client = Groq(api_key=groq_api_key) if groq_api_key else None

    translations = []

    for i, chapter in enumerate(book_data):
        prompt = f"""Translate the following Chinese web novel chapter to English. Maintain the original tone and style of the novel. Preserve any cultural references or idioms, providing brief explanations in parentheses if necessary.
If paragraphs are stuck together, split them. Retain people's names in Pinyin format (e.g., Chen Jingle), but fully translate all other terms, phrases, and concepts into English. Avoid using Pinyin for non-name elements to ensure clarity and natural flow for English readers.
Translate every Chinese character to English; the chapter should be fully translated.
Glossary:
{formatted_glossary}

Chinese chapter:
{chapter['content']}
Note: no introductory or concluding sentences. Just directly provide the translation.
Translate the above text to English, using the glossary for consistent translations of key terms:"""

        translation = None

        # First choice: Gemini, with one retry after a 30-second pause.
        print("Translating with Gemini...")
        for attempt in range(2):
            try:
                gemini_response = gemini_model.generate_content(prompt)
                translation = gemini_response.text
                break
            except Exception as e:
                print(f"Gemini error (attempt {attempt + 1}): {str(e)}")
                if attempt == 1:
                    print("Gemini failed. Falling back to Groq LLaMA model")
                else:
                    time.sleep(30)

        # Fallback: Groq, when a key was provided and Gemini failed.
        if not translation and groq_client:
            for attempt in range(2):
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": prompt}],
                        model="llama3-70b-8192",
                        timeout=30
                    )
                    translation = chat_completion.choices[0].message.content
                    break
                except Exception as e:
                    print(f"Groq error (attempt {attempt + 1}): {str(e)}")
                    if attempt == 1:
                        print(f"Failed to translate Chapter {i + 1} after all attempts")
                    else:
                        time.sleep(30)

        # Record an explicit failure marker so later steps never see None.
        if not translation:
            translation = f"TRANSLATION FAILED: {chapter['title']}"

        translations.append({
            'title': chapter['title'],
            'translated_content': translation
        })
        print(f"Completed translation of Chapter {i + 1}")
        print("First 500 characters of translation:")
        print(translation[:500] + "...")
        print('=======================================')
        time.sleep(5)

    with open(TRANSLATIONS_FILE, 'w', encoding='utf-8') as f:
        json.dump(translations, f, ensure_ascii=False, indent=2)
    print(f'Translation completed. Translations saved to {TRANSLATIONS_FILE}')
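
# TRANSLATIONS_FILE ends up as a list of {"title", "translated_content"}
# objects in the same order as the split chapters.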


def process_novel(first_chapter_url, final_url, novel_name, gemini_api_key, groq_api_key):
    # novel_name is collected by the UI but not yet used by the pipeline.
    # Stage 1: scrape raw chapters.
    asyncio.run(scrape_task(first_chapter_url, final_url))

    # Stage 2: split overly long chapters so each fits in a single model call.
    process_chapters(CHAPTERS_FILE, SPLIT_CHAPTERS_FILE)

    # Stage 3: build a term glossary from sample chapters.
    create_glossary(gemini_api_key, groq_api_key)

    # Stage 4: translate everything with glossary-guided prompts.
    translate_task(gemini_api_key, groq_api_key)

    return "Scraping, Processing, and Translation Completed!"


# Gradio front end: collects the inputs and runs the full pipeline.
iface = gr.Interface(
    fn=process_novel,
    inputs=[
        gr.Textbox(label="First Chapter URL"),
        gr.Textbox(label="Final Chapter URL (optional)"),
        gr.Textbox(label="Novel Name"),
        gr.Textbox(label="Gemini API Key", type="password"),
        gr.Textbox(label="Groq API Key (optional)", type="password"),
    ],
    outputs="text",
    title="Novel Scraper and Translator",
    description="Input the first chapter URL, final chapter URL (optional), novel name, and API keys to scrape and translate the novel."
)
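
# launch(share=True) would additionally expose a temporary public URL.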
iface.launch()