Spaces:

cogcorp
/

assignment1

Sleeping

App Files Files Community

cogcorp committed on May 24, 2023

Commit

0b87fda

1 Parent(s): 10589bd

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -81

app.py CHANGED Viewed

@@ -1,94 +1,39 @@
 import os
 import zipfile
-import openai
 import gradio as gr
-from gradio import components as grc
-# Set up OpenAI API credentials
-openai.api_key = "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"
-# Function to extract text from PDF using OpenAI API
-def extract_text_from_pdf(pdf_path):
-    with open(pdf_path, "rb") as f:
-        pdf_bytes = f.read()
-    response = openai.Completion.create(
-        engine="text-davinci-003",
-        prompt=pdf_bytes.decode("utf-8"),
-        max_tokens=2048,
-        temperature=0.7,
-        n=1,
-        stop=None,
-        timeout=120,
-    )
-    return response.choices[0].text.strip()
-# Function to extract text from multiple PDFs in a ZIP archive
-def extract_text_from_zip(zip_file):
-    corpus = ""
-    with zipfile.ZipFile(zip_file, "r") as zip_ref:
-        for file_name in zip_ref.namelist():
-            if file_name.endswith(".pdf"):
-                extracted_text = extract_text_from_pdf(zip_ref.read(file_name))
-                corpus += extracted_text + "\n"
-    return corpus
-# Function to split text into chunks based on maximum token length
-def split_text_into_chunks(text, max_tokens=2048):
-    chunks = []
-    words = text.split()
-    current_chunk = ""
-    for word in words:
-        if len(current_chunk) + len(word) <= max_tokens:
-            current_chunk += word + " "
-        else:
-            chunks.append(current_chunk.strip())
-            current_chunk = word + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-# Function to process files and query using OpenAI API
-def process_files_and_query(zip_file, query):
-    # Save uploaded ZIP file
-    zip_path = "uploaded.zip"
-    with open(zip_path, "wb") as f:
-        f.write(zip_file.read())
-    # Extract text from PDFs in the ZIP archive
-    corpus = extract_text_from_zip(zip_file)
-    # Split the corpus into chunks
-    chunks = split_text_into_chunks(corpus)
-    # Perform OpenAI API query on each chunk
-    responses = []
-    for chunk in chunks:
-        prompt = chunk + "\nQuery: " + query
-        response = openai.Completion.create(
-            engine="text-davinci-003",
-            prompt=prompt,
-            max_tokens=2048,
-            temperature=0.7,
-            n=1,
-            stop=None,
-            timeout=120,
-        )
-        responses.append(response.choices[0].text.strip())
-    # Combine the responses into a single answer
-    answer = " ".join(responses)
-    return answer
-# Gradio input and output interfaces
-zip_file_input = grc.File(label="Upload ZIP File")
-query_input = grc.Textbox(label="Enter your query")
-output = grc.Textbox(label="Answer")
-# Gradio interface configuration
-iface = gr.Interface(fn=process_files_and_query, inputs=[zip_file_input, query_input], outputs=output, title="PDF Search", description="Upload a ZIP file containing PDFs, enter your query, and get the answer.")
 iface.launch()

 import os
+import io
 import zipfile
+from pdf2image import convert_from_path
+import easyocr
 import gradio as gr
+def convert_pdf_to_text(input_zip):
+    if not input_zip.name.endswith(".zip"):
+        return "Please upload a .zip file."
+    text_contents = ''
+    reader = easyocr.Reader(['en']) # Specify the language(s)
+    with zipfile.ZipFile(input_zip.name, 'r') as zip_ref:
+        for file_name in zip_ref.namelist():
+            if file_name.endswith('.pdf'):
+                pdf_file_path = zip_ref.extract(file_name)
+                # Convert PDF to a list of images
+                images = convert_from_path(pdf_file_path)
+                # Iterate through each image and perform OCR using easyocr
+                for image in images:
+                    result = reader.readtext(image, detail=0)  # detail=0 for only the OCR'd text
+                    text_contents += ' '.join(result)
+                # Clean up the extracted pdf file
+                os.remove(pdf_file_path)
+    return text_contents
+iface = gr.Interface(
+    fn=convert_pdf_to_text,
+    inputs=gr.inputs.File(),
+    outputs="text"
+)
 iface.launch()