| import gradio as gr | |
| import warnings | |
| from typing import List | |
| import json | |
| from pdfitdown.pdfconversion import convert_to_pdf, convert_markdown_to_pdf | |
| from base_utils import ( | |
| convert_pdf_to_image, | |
| extract_text_from_pdf, | |
| convert_doc_to_text, | |
| extract_text_from_docx, | |
| extract_text_from_ppt, | |
| extract_text_from_pptx, | |
| sanitize_list_of_lists, | |
| parse_url, | |
| ) | |
# --- Text / image extraction interfaces -------------------------------------
# Each interface wraps one helper imported from `base_utils` and is registered
# under its own `api_name`.

pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_to_text = gr.Interface(
    fn=convert_doc_to_text,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="doc_to_text",
)

docx_to_text = gr.Interface(
    fn=extract_text_from_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="docx_to_text",
)

ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="ppt_to_text",
)

pptx_to_text = gr.Interface(
    fn=extract_text_from_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(),
    api_name="pptx_to_text",
)

# Text in, JSON out; ships with a single pre-filled example: a list of
# [question, answer] pairs written as text.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)

# Text in, text out, backed by `parse_url`.
url_parser = gr.Interface(
    fn=parse_url,
    inputs=["text"],
    outputs=["text"],
    api_name="url_to_text",
)
class FileNotConvertedWarning(Warning):
    """Warning category for a file skipped during PDF conversion.

    Issued when the file's format is not one of those that can be converted.
    """
# Extensions that are handed to a converter. ".md" goes through
# `convert_markdown_to_pdf`; every other entry goes through `convert_to_pdf`.
_CONVERTIBLE_EXTENSIONS = (".docx", ".html", ".pptx", ".csv", ".xml", ".md")


def to_pdf(files: List[str]) -> List[str]:
    """Convert each supported file to PDF and collect the resulting paths.

    Files already ending in ``.pdf`` are passed through untouched. Files with
    an extension listed in ``_CONVERTIBLE_EXTENSIONS`` are converted, and the
    value returned by the converter is collected. Any other file is skipped
    with a ``FileNotConvertedWarning``.

    Args:
        files: Paths of the files to convert.

    Returns:
        One entry per file that was passed through or converted, in input
        order. Skipped files contribute nothing.
    """
    pdfs = []
    for f in files:
        if f.endswith(".pdf"):
            pdfs.append(f)
            continue

        ext = next((e for e in _CONVERTIBLE_EXTENSIONS if f.endswith(e)), None)
        if ext is None:
            warnings.warn(
                f"File {f} was not converted to PDF because its file format is not included in those that can be converted",
                FileNotConvertedWarning,
            )
            continue

        # Strip only the trailing extension. Using str.replace / split(".")
        # here would also rewrite or truncate at dots elsewhere in the path
        # (e.g. "/tmp/run.docx.d/a.docx" or "./a.docx").
        root = f[: -len(ext)]
        newfile = root + ".pdf"
        converter = convert_markdown_to_pdf if ext == ".md" else convert_to_pdf
        # Third argument is the output path without its extension, as before
        # (presumably used as the document title by pdfitdown - confirm).
        pdfs.append(converter(f, newfile, root))
    return pdfs
def convert(file: str) -> List[str]:
    """Convert a single uploaded file to PDF via ``to_pdf``.

    Args:
        file: Path of the uploaded file. May be ``None`` or empty when the
            form is submitted without an upload.

    Returns:
        The list produced by ``to_pdf`` for this one file: a single path when
        the file was converted or already a PDF, an empty list when it was
        skipped or when no file was supplied.
    """
    # Nothing uploaded: return the same empty result as an unsupported file
    # instead of failing on `None.endswith` inside `to_pdf`.
    if not file:
        return []
    return to_pdf([file])
def parse_MCQs(mcq_string: str) -> List[List[str]]:
    """Extract and decode the JSON array embedded in ``mcq_string``.

    Everything before the first ``[`` and after the last ``]`` is discarded
    (for example a surrounding Markdown code fence), and the remaining text is
    decoded with ``json.loads``.

    Args:
        mcq_string: Text containing a JSON array, possibly wrapped in other
            text.

    Returns:
        The decoded JSON value. The element type is whatever the array holds.

    Raises:
        ValueError: If the text contains no ``[``, or if the extracted text is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    start = mcq_string.find("[")
    if start == -1:
        raise ValueError("No JSON array found in input: missing '['")

    payload = mcq_string[start:]
    end = payload.rfind("]")
    if end != -1:
        # Drop the last "]" and anything after it; the bracket is re-added below.
        payload = payload[:end]
    # When no "]" is present the array is closed anyway, as the previous
    # implementation did.
    return json.loads(payload + "]")
# Text in, JSON out, backed by `parse_MCQs`. The single example is a JSON array
# of question objects wrapped in a Markdown ```json fence; example caching is
# turned off.
mcqs_to_json = gr.Interface(
    fn=parse_MCQs,
    inputs=gr.Textbox(),
    outputs=gr.JSON(),
    api_name="mcqs_to_json",
    cache_examples=False,
    examples=[
        [
            """```json
[
{
"question": "Which of the following best describes the nature of business?",
"options": {
"A": "It is primarily a non-economic activity",
"B": "It involves personal consumption of goods",
"C": "It includes regular and continuous transactions for profit",
"D": "It excludes exchange of goods and services"
},
"answer": "C"
},
{
"question": "According to the document, what is a primary objective of business under economic objectives?",
"options": {
"A": "Employee welfare",
"B": "Profit earning",
"C": "Creating entertainment content",
"D": "Reducing government involvement"
},
"answer": "B"
},
{
"question": "Which of the following is a component of commerce?",
"options": {
"A": "Mining",
"B": "Manufacturing",
"C": "Warehousing",
"D": "Farming"
},
"answer": "C"
},
{
"question": "What is an example of a synthetic manufacturing industry?",
"options": {
"A": "Oil refining",
"B": "Textile processing",
"C": "Soap production",
"D": "Watch assembly"
},
"answer": "C"
},
{
"question": "Which aid to trade helps in overcoming the hindrance of knowledge in commerce?",
"options": {
"A": "Banking",
"B": "Insurance",
"C": "Advertising",
"D": "Warehousing"
},
"answer": "C"
}
]
```
"""
        ]
    ],
)

# File in, file out, backed by `convert`.
pdf_converter = gr.Interface(
    title="File to PDF Converter",
    description="Upload a file in .docx, .pdf, .html, .pptx, .csv, .xml, or .md format, and get it converted to PDF.",
    fn=convert,
    inputs=gr.File(label="Upload your file"),
    outputs=gr.File(label="Converted PDF"),
    api_name="convert_to_pdf",
)
# Combine every interface into one tabbed app. The second list holds the tab
# titles, in the same order as the interfaces in the first list.
demo = gr.TabbedInterface(
    [
        pdf_to_img,
        pdf_to_text,
        doc_to_text,
        docx_to_text,
        ppt_to_text,
        pptx_to_text,
        url_parser,
        str_to_json,
        mcqs_to_json,
        pdf_converter,
    ],
    [
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC Text",
        "Extract DOCX Text",
        "Extract PPT Text",
        "Extract PPTX Text",
        "Extract text from URL",
        "Extract Json",
        "Parse MCQs",
        "Convert to PDF",
    ],
)

# "0.0.0.0" (no trailing dot) is the address Gradio documents for making the
# app reachable from other machines; the previous value "0.0.0.0." is not a
# valid IPv4 literal.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)