# yomitalk/tests/data/create_sample_pdf.py
# KyosukeIchikawa's picture
# Remove the PyMuPDF (fitz) dependency and improve the VOICEVOX initialization process
# de0b32e
"""
Script to create a sample PDF for testing the PDF extraction feature.
"""
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from tests.utils.logger import test_logger as logger
class _PageWriter:
    """Cursor that draws left-aligned lines top-down and breaks pages as needed.

    Bundles a ReportLab canvas with the current Y coordinate so that every
    line, headings included, goes through the same page-break check.
    """

    def __init__(self, pdf, left, top_y, min_y):
        self.pdf = pdf
        self.left = left
        self.top_y = top_y
        self.min_y = min_y
        self.y = top_y
        self._font = None

    def set_font(self, name, size):
        """Select a font and remember it so it can be restored after a page break."""
        self._font = (name, size)
        self.pdf.setFont(name, size)

    def draw_line(self, text, advance):
        """Draw *text* at the cursor, then move the cursor down by *advance* points."""
        if self.y < self.min_y:
            self.pdf.showPage()
            self.y = self.top_y
            # showPage() forgets the font selection, so the active font must be
            # re-applied on the new page (this matters for bold headings too).
            if self._font is not None:
                self.pdf.setFont(*self._font)
        self.pdf.drawString(self.left, self.y, text)
        self.y -= advance

    def skip(self, amount):
        """Move the cursor down by *amount* points without drawing anything."""
        self.y -= amount


def _wrap_words(text, max_width, measure):
    """Greedily split *text* into lines whose measured width stays below *max_width*.

    *measure* is a callable returning the rendered width of a string. A single
    word wider than *max_width* is kept on a line of its own.
    """
    wrapped = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if measure(candidate) < max_width:
            current = candidate
            continue
        # Only flush real content; an over-long first word must not emit a blank line.
        if current:
            wrapped.append(current)
        current = word
    if current:
        wrapped.append(current)
    return wrapped


def create_sample_pdf(output_path="sample_paper.pdf"):
    """Create a sample academic paper PDF for testing.

    Args:
        output_path: Destination file path. Its parent directory is created
            when it does not exist yet.

    Returns:
        The same ``output_path`` that was passed in.
    """
    # Ensure the output directory exists (exist_ok avoids a check-then-create race).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Page size (width and height).
    page_width, page_height = letter

    # Create PDF canvas
    c = canvas.Canvas(output_path, pagesize=letter)

    # Layout constants, in points.
    margin = 50
    text_width = page_width - 2 * margin
    line_height = 15
    section_space = 50
    heading_space = 20
    reference_space = 20

    body_font = ("Helvetica", 12)
    heading_font = ("Helvetica-Bold", 14)

    # Start at the top margin; once the cursor drops below margin + 50 the next
    # drawn line starts a new page.
    writer = _PageWriter(c, margin, page_height - margin, margin + 50)

    # Title
    writer.set_font("Helvetica-Bold", 18)
    writer.draw_line("Sample Paper", 30)

    # Author information
    writer.set_font(*body_font)
    writer.draw_line("Author: Taro Yamada", 20)
    writer.draw_line("Affiliation: Sample University", section_space)

    abstract = """
    This is a sample research paper PDF for testing. It is used for functionality
    testing of the Paper Podcast Generator. This test will verify that text is
    correctly extracted from this PDF and properly processed.
    """

    intro = """
    In recent years, media development for wider dissemination of research papers
    has received attention. Especially, podcast format as audio content helps busy
    researchers and students effectively use their commuting time. This research
    proposes a system that automatically converts research papers into podcast format.
    The importance of research accessibility has been highlighted in numerous studies.
    Traditional research papers are often limited to academic communities, while multimedia
    formats can reach broader audiences including practitioners, policymakers, and the
    general public interested in scientific advancements.
    """

    method = """
    The proposed system converts research papers into podcasts using the following steps:
    1. Text extraction from PDF
    2. Text summarization and formatting
    3. Conversion to podcast format
    4. Audio generation using speech synthesis
    For speech synthesis, character voices specialized for Japanese like "Zundamon"
    are used to provide friendly audio content.
    The system architecture consists of several modular components that can be customized
    based on specific requirements. The PDF parsing module extracts text while preserving
    the document structure, including headings, paragraphs, and references. The summarization
    module employs natural language processing techniques to identify key information and
    create a concise narrative suitable for audio consumption.
    """

    results = """
    The evaluation experiments showed that podcasts generated by the proposed system
    achieved 90% information retention compared to manually created ones.
    In user evaluations, the system also received high ratings for the naturalness
    of the voice and the ease of understanding the content.
    Detailed analysis revealed several interesting findings:
    - Audio quality was rated 4.5/5 on average by 50 participants
    - Comprehension tests showed 85% accuracy for technical content
    - Time savings compared to reading the full paper: approximately 75%
    - User satisfaction was significantly higher (p<0.01) for papers with
    clear structure and well-defined sections
    These results suggest that automated paper-to-podcast conversion can successfully
    translate complex research into accessible audio format while maintaining the
    essential information and scientific integrity of the original work.
    """

    conclusion = """
    In this research, we proposed an automated paper-to-podcast conversion system
    and confirmed its effectiveness. Future challenges include support for more diverse
    paper styles and multilingual support.
    The system demonstrates the potential of using AI to bridge the gap between
    academic writing and public dissemination of research findings. As research
    output continues to grow exponentially, tools that facilitate knowledge
    transfer will become increasingly important.
    Future work will focus on expanding language support, improving handling of
    complex scientific notation and mathematical formulae, and developing domain-specific
    models for fields such as medicine, physics, and computer science. We also plan to
    explore interactive features that would allow listeners to navigate complex content
    more effectively.
    """

    sections = [
        ("Abstract", abstract),
        ("1. Introduction", intro),
        ("2. Method", method),
        ("3. Results", results),
        ("4. Conclusion", conclusion),
    ]
    for heading, body in sections:
        # Headings use the same page-break check as body lines, so a heading is
        # never left stranded below the minimum Y of a page.
        writer.set_font(*heading_font)
        writer.draw_line(heading, heading_space)

        writer.set_font(*body_font)
        for raw_line in body.strip().split("\n"):
            text = raw_line.strip()
            if text:  # skip blank lines
                writer.draw_line(text, line_height)

        # Gap before the next section.
        writer.skip(section_space)

    # References
    writer.set_font(*heading_font)
    writer.draw_line("References", heading_space)
    writer.set_font(*body_font)

    references = [
        "1. Yamada, T. (2023). 'Latest Trends in Speech Synthesis Technology'. Journal of Speech Processing, 15(2), 123-135.",
        "2. Sato, H. (2022). 'Effects of Media Development in Research Paper Dissemination'. Journal of Academic Information, 8(3), 45-52.",
        "3. Yamada, T. & Sato, H. (2023). 'Automatic podcast generation from academic papers'. Journal of AI Applications, 10(4), 210-225.",
        "4. Johnson, L. et al. (2021). 'Converting Scientific Papers to Audio: Challenges and Opportunities'. Proceedings of the International Conference on Audio Technology, 78-92.",
        "5. Garcia, M. (2022). 'Voice Synthesis for Academic Content'. Digital Library Research Journal, 5(1), 45-67.",
        "6. Tanaka, K. (2021). 'Analysis of Information Retention in Different Media Formats'. Cognitive Science Quarterly, 33(2), 228-244.",
        "7. Smith, J. & Brown, K. (2022). 'Accessibility of Research Findings Through Alternative Media'. Journal of Science Communication, 14(3), 112-134.",
    ]

    def body_width(candidate):
        return c.stringWidth(candidate, body_font[0], body_font[1])

    for ref in references:
        # Wrap long references to the printable width.
        wrapped = _wrap_words(ref, text_width, body_width)
        for piece in wrapped[:-1]:
            writer.draw_line(piece, line_height)
        if wrapped:
            # The last line of an entry is followed by the inter-reference gap.
            writer.draw_line(wrapped[-1], reference_space)

    # Save the PDF (this also finalizes the last page).
    c.save()
    return output_path
if __name__ == "__main__":
    # Script entry point: write the sample PDF into the directory holding this file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    created_path = create_sample_pdf(os.path.join(script_dir, "sample_paper.pdf"))
    logger.info(f"Sample PDF created: {created_path}")