# Page header captured with this listing (not program code): "Spaces: Runtime error"
| from gpt_index import Document, GPTListIndex | |
| import gradio as gr | |
| import openai | |
| import os | |
| import PyPDF2 | |
| import docx | |
| import pytesseract | |
| from PIL import Image | |
def pdftotext(file_name):
    """
    Function to extract text from .pdf format files

    Opens ``file_name`` in read-binary mode, reads it with
    ``PyPDF2.PdfReader`` and returns the extracted text of every page,
    joined with newline characters, as a single string.
    """
    # Open the PDF file in read-binary mode
    with open(file_name, 'rb') as file:
        # Create a PDF object
        pdf = PyPDF2.PdfReader(file)
        # Iterate over the pages directly instead of indexing by page number.
        # `or ""` keeps the join below safe in case extract_text() hands
        # back None for a page.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
def docxtotext(file_name):
    """
    Function to read .docx format files

    Loads ``file_name`` with ``docx.Document`` and returns the text of all
    its paragraphs joined with newline characters.
    """
    # Open the Word document and grab its paragraphs
    paragraphs = docx.Document(file_name).paragraphs
    # One line of output per paragraph
    return '\n'.join(para.text for para in paragraphs)
def readtextfile(file_name):
    """
    Function to read .txt format files

    Returns the whole content of ``file_name`` as a string. The file is
    decoded as UTF-8 regardless of the platform default; bytes that are not
    valid UTF-8 are replaced with U+FFFD instead of raising
    ``UnicodeDecodeError``, so one badly encoded upload does not abort
    processing.
    """
    # Open the Text document with an explicit encoding
    with open(file_name, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()
def imagetotext(file_name):
    """
    Function to extract text from images

    Opens ``file_name`` with PIL and returns the string produced by
    ``pytesseract.image_to_string`` for that image.
    """
    # Open the image using PIL; the context manager closes the underlying
    # file once OCR is done instead of leaving the handle open.
    with Image.open(file_name) as image:
        # Extract the text from the image
        return pytesseract.image_to_string(image)
def preprocesstext(text):
    """
    Function to preprocess text

    Drops every line of ``text`` that is empty or whitespace-only and
    returns the remaining lines, unchanged, joined with newline characters.
    """
    # str.strip yields an empty (falsy) string for blank lines, so filter()
    # keeps only lines that carry visible content.
    return '\n'.join(filter(str.strip, text.splitlines()))
def processfiles(files):
    """
    Function to extract text from documents

    For every object in ``files`` (each must expose a ``name`` attribute
    holding a file path) the extractor matching the file extension is
    called, the result is passed through ``preprocesstext`` and appended to
    the returned list. Files with an unsupported or missing extension
    contribute an empty string, so the output has one entry per input file.
    """
    # Extension -> extractor dispatch table
    extractors = {
        "pdf": pdftotext,
        "docx": docxtotext,
        "txt": readtextfile,
        "png": imagetotext,
        "jpg": imagetotext,
        "jpeg": imagetotext,
    }
    textlist = []
    # Iterate over provided files
    for file in files:
        # Get file name
        file_name = file.name
        # Get extension of file name. splitext only looks at the final path
        # component, so a dot-less name such as "pdf" is not mistaken for an
        # extension.
        ext = os.path.splitext(file_name)[1].lstrip(".").lower()
        # Process document based on extension
        extractor = extractors.get(ext)
        text = extractor(file_name) if extractor is not None else ""
        # Preprocess text and append it to final result
        textlist.append(preprocesstext(text))
    return textlist
def createdocuments(textlist):
    """
    Function to create documents as needed for indexing.

    Wraps each string of ``textlist`` in a ``Document`` and returns the
    resulting list, in the same order.
    """
    # Create one Document per extracted text
    return [Document(text) for text in textlist]
def fileformatvaliditycheck(files):
    """
    Function to check validity of file formats

    Returns True when every object in ``files`` (each must expose a ``name``
    attribute holding a file path) has one of the supported extensions
    pdf/txt/docx/png/jpg/jpeg, compared case-insensitively, and False as
    soon as one does not. An empty ``files`` yields True.
    """
    allowed = {"pdf", "txt", "docx", "png", "jpg", "jpeg"}
    # splitext only looks at the final path component, so a dot-less name
    # such as "pdf" is not mistaken for a valid extension.
    return all(
        os.path.splitext(file1.name)[1].lstrip(".").lower() in allowed
        for file1 in files
    )
def openaiapikeyvaliditycheck(openaikey):
    """
    Function to check validity of openai key

    Sets ``openai.api_key`` to ``openaikey`` and issues ``openai.Model.list()``
    as a probe request. Returns the exact string "Valid OpenAI API key" when
    the request succeeds; when it raises ``openai.OpenAIError`` an error
    message is returned instead. The message shows at most the last four
    characters of the key so the secret is not echoed back in full.
    """
    # Set the API key
    openai.api_key = openaikey
    # Test the API key by making a request to the OpenAI API
    try:
        openai.Model.list()
    except openai.OpenAIError:
        apikeylink = "https://beta.openai.com/account/api-keys"
        key = str(openaikey or "")
        # Only reveal a short suffix, and nothing at all for very short keys
        maskedkey = "****" + key[-4:] if len(key) > 8 else "****"
        return f"Incorrect OpenAI API key provided: {maskedkey}. You can find your OpenAI API key here - {apikeylink}"
    return "Valid OpenAI API key"
def createindex(files, openaikey):
    """
    Function to create index

    Validates the inputs in order (files present, file formats supported,
    key present, key accepted by ``openaiapikeyvaliditycheck``) and returns
    the matching message for the first check that fails. Otherwise it stores
    the key in the ``OPENAI_API_KEY`` environment variable, extracts text
    from the files, builds a ``GPTListIndex`` over the resulting documents,
    saves it to ``index.json`` and returns a success message.
    """
    # Basic checks, each one bails out with its own message
    if not files:
        return "Upload file before proceeding further."
    if not fileformatvaliditycheck(files):
        return "Please upload documents in pdf/txt/docx/png/jpg/jpeg format only."
    if not openaikey:
        return "Please enter your openai key."
    keystatus = openaiapikeyvaliditycheck(openaikey)
    if keystatus != "Valid OpenAI API key":
        return keystatus

    # Store openai key in environment
    os.environ['OPENAI_API_KEY'] = openaikey

    # Turn the uploaded files into Documents
    documents = createdocuments(processfiles(files))

    # Create the index and persist it
    GPTListIndex(documents, chunk_size_limit=3500).save_to_disk('index.json')
    return "Uploading documents successfully. OpenAI API Key provided is Valid."
def docques(query, openaikey):
    """
    Function for querying the index created

    Returns a short message instead of raising when the key or the question
    is empty, or when ``index.json`` has not been created yet (i.e. no
    successful upload happened). Otherwise stores the key in the
    ``OPENAI_API_KEY`` environment variable, loads the index from
    ``index.json`` and returns the result of querying it with
    ``response_mode="tree_summarize"``.
    """
    # Guard clauses: fail with a readable message rather than an exception
    if not openaikey:
        return "Please enter your openai key."
    if not query or not query.strip():
        return "Please enter your question."
    if not os.path.exists('index.json'):
        return "Upload documents and create the index before asking a question."
    # Store openai key in environment
    os.environ['OPENAI_API_KEY'] = openaikey
    # Load index
    index = GPTListIndex.load_from_disk('index.json')
    # Query based on index
    response = index.query(query, response_mode="tree_summarize")
    return response
def cleartext(query, output):
    """
    Function to clear text

    Ignores both arguments and returns a list of two empty strings, one for
    each output component wired to the clear button.
    """
    return [""] * 2
# Gradio UI: upload documents + key, build the index, then ask questions.
# NOTE(review): the nesting of rows/columns below was reconstructed from a
# listing that had lost its indentation - confirm the layout.
with gr.Blocks() as demo:
    # Title (the <b> tag is now closed)
    gr.Markdown(
        """
<h1><center><b>DocQues</b></center></h1>
""")
    gr.Markdown(
        """
This app answers your queries on longer and multiple documents (pdf/docx/txt/png/jpeg/jpg) you upload. It uses <a href = "https://github.com/jerryjliu/gpt_index">GPT-Index</a> and OpenAI GPT3 in the backend, get your
<a href = "https://beta.openai.com/account/api-keys">Openai key here</a> before proceeding further.\n
""")
    gr.Markdown(
        """
<br>**Use this space effectively by following below 2 step process.**</br>
*Step-1*
<br>- Upload pdf/docx/txt/png/jpeg/jpg format documents.
<br>- Enter your openai key.
<br>- Click upload and wait to see if upload is successful or not. </br>
*Step-2*
<br>- Enter your query.
<br>- Click submit.
<br>- Check Answer </br>
Please refer to the GitHub repo this Space is based on, here - <a href = "https://github.com/ravi03071991/DocQues">DocQues</a> .
"""
    )
    with gr.Row():
        with gr.Column():
            # Label lists every format createindex accepts, images included
            files = gr.File(label="Upload pdf/docx/txt/png/jpeg/jpg format documents.", file_count="multiple")
            # type="password" keeps the secret key from being shown in clear text
            openaikey = gr.Textbox(lines=1, type="password", label="Enter your OpenAI Key.")
            upload_button = gr.Button("Upload")
            query = gr.Textbox(lines=2, label="Enter Your Question.")
            submit_button = gr.Button("Submit")
        with gr.Column():
            upload_output = gr.Textbox(label="Upload/ Error.")
            ans_output = gr.Textbox(label="Answer.")
            clear_button = gr.Button("Clear")
    # Upload button for uploading files and openai key.
    upload_button.click(createindex, inputs=[files, openaikey], outputs=[upload_output])
    # Submit button for submitting query.
    submit_button.click(docques, inputs=[query, openaikey], outputs=[ans_output])
    # Clear button for clearing query and answer.
    clear_button.click(cleartext, inputs=[query, ans_output], outputs=[query, ans_output])
demo.launch()