| """ |
| Script to create a sample PDF for testing the PDF extraction feature. |
| """ |
|
|
| import os |
|
|
| from reportlab.lib.pagesizes import letter |
| from reportlab.pdfgen import canvas |
|
|
| from tests.utils.logger import test_logger as logger |
|
|
|
|
def _draw_heading(c, text, x, y, min_y, top_y):
    """Draw a bold 14pt section heading and return the y for the text below it.

    Args:
        c: The reportlab canvas being drawn on.
        text: Heading text.
        x: Left x coordinate of the heading.
        y: Current baseline y coordinate.
        min_y: Lowest y at which content may still be drawn on a page.
        top_y: Baseline y of the first line on a fresh page.

    Returns:
        The y coordinate 20 points below the drawn heading.
    """
    if y < min_y:
        c.showPage()
        y = top_y
    # Select the font only AFTER a possible page break: showPage() resets the
    # canvas font, so setting it beforehand would lose the bold heading style.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(x, y, text)
    return y - 20


def _draw_body_line(c, text, x, y, min_y, top_y):
    """Draw one line of 12pt body text, breaking to a new page when needed.

    The caller is expected to have selected the body font already; it is only
    re-selected here after a page break, because showPage() resets it.

    Returns:
        The y coordinate the line was actually drawn at.
    """
    if y < min_y:
        c.showPage()
        y = top_y
        c.setFont("Helvetica", 12)
    c.drawString(x, y, text)
    return y


def _draw_paragraph(c, text, x, y, min_y, top_y, line_height):
    """Draw every non-blank line of *text* as body text, one line per row.

    Lines are stripped of surrounding whitespace and blank lines are skipped,
    so the indentation of the source literal does not affect the output.

    Returns:
        The y coordinate for whatever follows the paragraph.
    """
    c.setFont("Helvetica", 12)
    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue
        y = _draw_body_line(c, line, x, y, min_y, top_y) - line_height
    return y


def _wrap_words(c, text, max_width):
    """Greedily wrap *text* into lines narrower than *max_width* points.

    Widths are measured with the canvas for Helvetica 12pt. A single word that
    is wider than *max_width* is kept on a line of its own rather than
    producing an empty line in front of it.

    Returns:
        A list of non-empty line strings (empty when *text* has no words).
    """
    wrapped = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if c.stringWidth(candidate, "Helvetica", 12) < max_width:
            current = candidate
        else:
            if current:
                wrapped.append(current)
            current = word
    if current:
        wrapped.append(current)
    return wrapped


def create_sample_pdf(output_path: str = "sample_paper.pdf") -> str:
    """Create a sample academic paper PDF for testing.

    The document contains a title block, an abstract, four numbered sections
    and a reference list, laid out on US-letter pages. Content flows onto a
    new page whenever the cursor drops below the bottom margin; this applies
    to section headings as well as body lines, so a heading is never left
    stranded below the margin with its body on the next page.

    Args:
        output_path: Destination file path. Its parent directory is created
            if it does not exist.

    Returns:
        The ``output_path`` that was written.
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # exist_ok avoids a check-then-create race with concurrent runs.
        os.makedirs(output_dir, exist_ok=True)

    page_width, page_height = letter
    c = canvas.Canvas(output_path, pagesize=letter)

    # Layout constants (points).
    margin = 50
    text_width = page_width - 2 * margin
    line_height = 15
    section_space = 50
    reference_space = 20
    top_y = page_height - margin
    min_y = margin + 50

    y = top_y

    # Title block.
    c.setFont("Helvetica-Bold", 18)
    c.drawString(margin, y, "Sample Paper")
    y -= 30

    c.setFont("Helvetica", 12)
    c.drawString(margin, y, "Author: Taro Yamada")
    y -= 20
    c.drawString(margin, y, "Affiliation: Sample University")
    y -= section_space

    abstract = """
    This is a sample research paper PDF for testing. It is used for functionality
    testing of the Paper Podcast Generator. This test will verify that text is
    correctly extracted from this PDF and properly processed.
    """

    intro = """
    In recent years, media development for wider dissemination of research papers
    has received attention. Especially, podcast format as audio content helps busy
    researchers and students effectively use their commuting time. This research
    proposes a system that automatically converts research papers into podcast format.

    The importance of research accessibility has been highlighted in numerous studies.
    Traditional research papers are often limited to academic communities, while multimedia
    formats can reach broader audiences including practitioners, policymakers, and the
    general public interested in scientific advancements.
    """

    method = """
    The proposed system converts research papers into podcasts using the following steps:

    1. Text extraction from PDF
    2. Text summarization and formatting
    3. Conversion to podcast format
    4. Audio generation using speech synthesis

    For speech synthesis, character voices specialized for Japanese like "Zundamon"
    are used to provide friendly audio content.

    The system architecture consists of several modular components that can be customized
    based on specific requirements. The PDF parsing module extracts text while preserving
    the document structure, including headings, paragraphs, and references. The summarization
    module employs natural language processing techniques to identify key information and
    create a concise narrative suitable for audio consumption.
    """

    results = """
    The evaluation experiments showed that podcasts generated by the proposed system
    achieved 90% information retention compared to manually created ones.
    In user evaluations, the system also received high ratings for the naturalness
    of the voice and the ease of understanding the content.

    Detailed analysis revealed several interesting findings:

    - Audio quality was rated 4.5/5 on average by 50 participants
    - Comprehension tests showed 85% accuracy for technical content
    - Time savings compared to reading the full paper: approximately 75%
    - User satisfaction was significantly higher (p<0.01) for papers with
    clear structure and well-defined sections

    These results suggest that automated paper-to-podcast conversion can successfully
    translate complex research into accessible audio format while maintaining the
    essential information and scientific integrity of the original work.
    """

    conclusion = """
    In this research, we proposed an automated paper-to-podcast conversion system
    and confirmed its effectiveness. Future challenges include support for more diverse
    paper styles and multilingual support.

    The system demonstrates the potential of using AI to bridge the gap between
    academic writing and public dissemination of research findings. As research
    output continues to grow exponentially, tools that facilitate knowledge
    transfer will become increasingly important.

    Future work will focus on expanding language support, improving handling of
    complex scientific notation and mathematical formulae, and developing domain-specific
    models for fields such as medicine, physics, and computer science. We also plan to
    explore interactive features that would allow listeners to navigate complex content
    more effectively.
    """

    sections = [
        ("Abstract", abstract),
        ("1. Introduction", intro),
        ("2. Method", method),
        ("3. Results", results),
        ("4. Conclusion", conclusion),
    ]

    # Every section follows the same pattern: heading, body lines, gap.
    for heading, body in sections:
        y = _draw_heading(c, heading, margin, y, min_y, top_y)
        y = _draw_paragraph(c, body, margin, y, min_y, top_y, line_height)
        y -= section_space

    references = [
        "1. Yamada, T. (2023). 'Latest Trends in Speech Synthesis Technology'. Journal of Speech Processing, 15(2), 123-135.",
        "2. Sato, H. (2022). 'Effects of Media Development in Research Paper Dissemination'. Journal of Academic Information, 8(3), 45-52.",
        "3. Yamada, T. & Sato, H. (2023). 'Automatic podcast generation from academic papers'. Journal of AI Applications, 10(4), 210-225.",
        "4. Johnson, L. et al. (2021). 'Converting Scientific Papers to Audio: Challenges and Opportunities'. Proceedings of the International Conference on Audio Technology, 78-92.",
        "5. Garcia, M. (2022). 'Voice Synthesis for Academic Content'. Digital Library Research Journal, 5(1), 45-67.",
        "6. Tanaka, K. (2021). 'Analysis of Information Retention in Different Media Formats'. Cognitive Science Quarterly, 33(2), 228-244.",
        "7. Smith, J. & Brown, K. (2022). 'Accessibility of Research Findings Through Alternative Media'. Journal of Science Communication, 14(3), 112-134.",
    ]

    y = _draw_heading(c, "References", margin, y, min_y, top_y)
    c.setFont("Helvetica", 12)
    for ref in references:
        wrapped = _wrap_words(c, ref, text_width)
        if not wrapped:
            continue
        # Continuation lines use the normal line height; the final line of
        # each entry is followed by a slightly larger gap between entries.
        for line in wrapped[:-1]:
            y = _draw_body_line(c, line, margin, y, min_y, top_y) - line_height
        y = _draw_body_line(c, wrapped[-1], margin, y, min_y, top_y)
        y -= reference_space

    c.save()

    return output_path
|
|
|
|
if __name__ == "__main__":
    # Write the sample next to this script, independent of the caller's
    # current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    created_path = create_sample_pdf(os.path.join(script_dir, "sample_paper.pdf"))
    logger.info(f"Sample PDF created: {created_path}")
|
|