File size: 10,555 Bytes
de0b32e e41e94d de0b32e e41e94d de0b32e e41e94d de0b32e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | """
Script to create a sample PDF for testing the PDF extraction feature.
"""
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from tests.utils.logger import test_logger as logger
# Fonts shared by every section, as (font name, point size) pairs.
_HEADING_FONT = ("Helvetica-Bold", 14)
_BODY_FONT = ("Helvetica", 12)

# Body sections in drawing order, as (heading, body text) pairs.  Each
# non-blank line of the body text is drawn as one line of the PDF; leading and
# trailing whitespace on every line is stripped before drawing.
_SECTIONS = (
    (
        "Abstract",
        """
        This is a sample research paper PDF for testing. It is used for functionality
        testing of the Paper Podcast Generator. This test will verify that text is
        correctly extracted from this PDF and properly processed.
        """,
    ),
    (
        "1. Introduction",
        """
        In recent years, media development for wider dissemination of research papers
        has received attention. Especially, podcast format as audio content helps busy
        researchers and students effectively use their commuting time. This research
        proposes a system that automatically converts research papers into podcast format.
        The importance of research accessibility has been highlighted in numerous studies.
        Traditional research papers are often limited to academic communities, while multimedia
        formats can reach broader audiences including practitioners, policymakers, and the
        general public interested in scientific advancements.
        """,
    ),
    (
        "2. Method",
        """
        The proposed system converts research papers into podcasts using the following steps:
        1. Text extraction from PDF
        2. Text summarization and formatting
        3. Conversion to podcast format
        4. Audio generation using speech synthesis
        For speech synthesis, character voices specialized for Japanese like "Zundamon"
        are used to provide friendly audio content.
        The system architecture consists of several modular components that can be customized
        based on specific requirements. The PDF parsing module extracts text while preserving
        the document structure, including headings, paragraphs, and references. The summarization
        module employs natural language processing techniques to identify key information and
        create a concise narrative suitable for audio consumption.
        """,
    ),
    (
        "3. Results",
        """
        The evaluation experiments showed that podcasts generated by the proposed system
        achieved 90% information retention compared to manually created ones.
        In user evaluations, the system also received high ratings for the naturalness
        of the voice and the ease of understanding the content.
        Detailed analysis revealed several interesting findings:
        - Audio quality was rated 4.5/5 on average by 50 participants
        - Comprehension tests showed 85% accuracy for technical content
        - Time savings compared to reading the full paper: approximately 75%
        - User satisfaction was significantly higher (p<0.01) for papers with
        clear structure and well-defined sections
        These results suggest that automated paper-to-podcast conversion can successfully
        translate complex research into accessible audio format while maintaining the
        essential information and scientific integrity of the original work.
        """,
    ),
    (
        "4. Conclusion",
        """
        In this research, we proposed an automated paper-to-podcast conversion system
        and confirmed its effectiveness. Future challenges include support for more diverse
        paper styles and multilingual support.
        The system demonstrates the potential of using AI to bridge the gap between
        academic writing and public dissemination of research findings. As research
        output continues to grow exponentially, tools that facilitate knowledge
        transfer will become increasingly important.
        Future work will focus on expanding language support, improving handling of
        complex scientific notation and mathematical formulae, and developing domain-specific
        models for fields such as medicine, physics, and computer science. We also plan to
        explore interactive features that would allow listeners to navigate complex content
        more effectively.
        """,
    ),
)

# Reference entries; each one is word-wrapped to the text width when drawn.
_REFERENCES = (
    "1. Yamada, T. (2023). 'Latest Trends in Speech Synthesis Technology'. Journal of Speech Processing, 15(2), 123-135.",
    "2. Sato, H. (2022). 'Effects of Media Development in Research Paper Dissemination'. Journal of Academic Information, 8(3), 45-52.",
    "3. Yamada, T. & Sato, H. (2023). 'Automatic podcast generation from academic papers'. Journal of AI Applications, 10(4), 210-225.",
    "4. Johnson, L. et al. (2021). 'Converting Scientific Papers to Audio: Challenges and Opportunities'. Proceedings of the International Conference on Audio Technology, 78-92.",
    "5. Garcia, M. (2022). 'Voice Synthesis for Academic Content'. Digital Library Research Journal, 5(1), 45-67.",
    "6. Tanaka, K. (2021). 'Analysis of Information Retention in Different Media Formats'. Cognitive Science Quarterly, 33(2), 228-244.",
    "7. Smith, J. & Brown, K. (2022). 'Accessibility of Research Findings Through Alternative Media'. Journal of Science Communication, 14(3), 112-134.",
)


def _nonblank_lines(text: str) -> list:
    """Return the stripped, non-blank lines of *text* in order."""
    return [line.strip() for line in text.strip().splitlines() if line.strip()]


def _draw_heading(c, title: str, x, y, *, min_y, top_y):
    """Draw a section heading, breaking to a new page first if needed.

    Returns the y coordinate for the first body line below the heading.
    """
    if y < min_y:
        c.showPage()
        y = top_y
    # Set the font only after the possible page break: starting a new page
    # resets the canvas font, so setting it earlier would lose the heading style.
    c.setFont(*_HEADING_FONT)
    c.drawString(x, y, title)
    return y - 20  # gap between a heading and its body


def _draw_lines(c, lines, x, y, *, line_height, min_y, top_y):
    """Draw *lines* top to bottom in the body font, paginating as needed.

    Returns the y coordinate one line below the last line drawn.
    """
    c.setFont(*_BODY_FONT)
    for text in lines:
        if y < min_y:
            # Out of room: continue at the top of a fresh page and restore the
            # body font, which the new page does not inherit.
            c.showPage()
            y = top_y
            c.setFont(*_BODY_FONT)
        c.drawString(x, y, text)
        y -= line_height
    return y


def _wrap_words(c, text: str, max_width) -> list:
    """Greedily wrap *text* into lines narrower than *max_width* in the body font.

    A single word wider than *max_width* is kept on its own line rather than
    producing an empty line before it.
    """
    wrapped = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if not current or c.stringWidth(candidate, *_BODY_FONT) < max_width:
            current = candidate
        else:
            wrapped.append(current)
            current = word
    if current:
        wrapped.append(current)
    return wrapped


def create_sample_pdf(output_path: str = "sample_paper.pdf") -> str:
    """Create a sample academic paper PDF for testing.

    The document contains a title, author block, five body sections
    (Abstract through Conclusion) and a word-wrapped reference list, laid out
    on letter-size pages with automatic page breaks.

    Args:
        output_path: Where to write the PDF. Its parent directory is created
            if it does not exist yet.

    Returns:
        ``output_path``, unchanged, once the PDF has been saved.
    """
    # Ensure the output directory exists (exist_ok avoids a check-then-create race).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Page size (width and height).
    page_width, page_height = letter
    c = canvas.Canvas(output_path, pagesize=letter)

    # Margins and the usable text width between them.
    margin = 50
    text_width = page_width - 2 * margin

    # Vertical metrics: line pitch, gap between sections, and the
    # baseline-to-baseline distance between consecutive references.
    line_height = 15
    section_space = 50
    reference_gap = 20

    # Drawing starts at the top margin; once y drops below min_y the next
    # heading or line is moved to a new page.
    top_y = page_height - margin
    min_y = margin + 50
    y = top_y

    # Title
    c.setFont("Helvetica-Bold", 18)
    c.drawString(margin, y, "Sample Paper")
    y -= 30  # space after the title

    # Author information
    c.setFont(*_BODY_FONT)
    c.drawString(margin, y, "Author: Taro Yamada")
    y -= 20
    c.drawString(margin, y, "Affiliation: Sample University")
    y -= section_space

    # Body sections: every heading and every body line gets the same
    # page-break handling, so no heading is stranded at the foot of a page.
    for heading, body in _SECTIONS:
        y = _draw_heading(c, heading, margin, y, min_y=min_y, top_y=top_y)
        y = _draw_lines(
            c,
            _nonblank_lines(body),
            margin,
            y,
            line_height=line_height,
            min_y=min_y,
            top_y=top_y,
        )
        y -= section_space

    # References: wrap each long entry to the text width.
    y = _draw_heading(c, "References", margin, y, min_y=min_y, top_y=top_y)
    for ref in _REFERENCES:
        y = _draw_lines(
            c,
            _wrap_words(c, ref, text_width),
            margin,
            y,
            line_height=line_height,
            min_y=min_y,
            top_y=top_y,
        )
        # _draw_lines already advanced one line past the last line; top that
        # up so the next reference starts reference_gap below it.
        y -= reference_gap - line_height

    # Save the PDF (this also finalizes the last page).
    c.save()
    return output_path
if __name__ == "__main__":
    # When run as a script, write the sample PDF into the directory that
    # contains this file and log the resulting path.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    created_path = create_sample_pdf(os.path.join(script_dir, "sample_paper.pdf"))
    logger.info(f"Sample PDF created: {created_path}")
|