Spaces:

cogcorp
/

assignment1

Sleeping

App Files Files Community

cogcorp committed on May 24, 2023

Commit

6e868cd

1 Parent(s): 5c2bb8b

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -158

app.py CHANGED Viewed

@@ -1,171 +1,94 @@
-import urllib.request
-import fitz
-import re
-import numpy as np
-import tensorflow_hub as hub
-import openai
-import gradio as gr
 import os
 import zipfile
-from sklearn.neighbors import NearestNeighbors
-openai.api_key = os.getenv('OpenAPI')
-def download_pdf(url, output_path):
-    urllib.request.urlretrieve(url, output_path)
-def extract_zip(file):
-    with zipfile.ZipFile(file, 'r') as zip_ref:
-        for member in zip_ref.namelist():
-            filename = os.path.basename(member)
-            if filename.endswith('.pdf'):
-                zip_ref.extract(member, 'pdfs')
-def preprocess(text):
-    text = text.replace('\n', ' ')
-    text = re.sub('\s+', ' ', text)
-    return text
-def pdf_to_text(path, start_page=1, end_page=None):
-    doc = fitz.open(path)
-    total_pages = doc.page_count
-    if end_page is None:
-        end_page = total_pages
-    text_list = []
-    for i in range(start_page-1, end_page):
-        text = doc.load_page(i).get_text("text")
-        text = preprocess(text)
-        text_list.append(text)
-    doc.close()
-    return text_list
-def text_to_chunks(texts, word_length=150, start_page=1):
-    text_toks = [t.split(' ') for t in texts]
     chunks = []
-    for idx, words in enumerate(text_toks):
-        for i in range(0, len(words), word_length):
-            chunk = words[i:i+word_length]
-            if (i+word_length) > len(words) and (len(chunk) < word_length) and (
-                len(text_toks) != (idx+1)):
-                text_toks[idx+1] = chunk + text_toks[idx+1]
-                continue
-            chunk = ' '.join(chunk).strip()
-            chunk = f'[Page no. {idx+start_page}]' + ' ' + '"' + chunk + '"'
-            chunks.append(chunk)
-    return chunks
-class SemanticSearch:
-    def __init__(self):
-        self.use = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
-        self.fitted = False
-    def fit(self, data, batch=1000, n_neighbors=15):
-        self.data = data
-        self.embeddings = self.get_text_embedding(data, batch=batch)
-        n_neighbors = min(n_neighbors, len(self.embeddings))
-        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
-        self.nn.fit(self.embeddings)
-        self.fitted = True
-    def __call__(self, text, return_data=True):
-        inp_emb = self.use([text])
-        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
-        if return_data:
-            return [self.data[i] for i in neighbors]
         else:
-            return neighbors
-    def get_text_embedding(self, texts, batch=1000):
-        embeddings = []
-        for i in range(0, len(texts), batch):
-            text_batch = texts[i:(i+batch)]
-            emb_batch = self.use(text_batch)
-            embeddings.append(emb_batch)
-        embeddings = np.vstack(embeddings)
-        return embeddings
-recommender = SemanticSearch()
-def load_recommender(paths, start_page=1):
-    global recommender
-    chunks = []
-    for path in paths:
-        if path.endswith('.pdf'):
-            texts = pdf_to_text(path, start_page=start_page)
-            chunks += text_to_chunks(texts, start_page=start_page)
-    recommender.fit(chunks)
-    return 'Corpus Loaded.'
-def generate_text(messages, engine='gpt-3.5-turbo', max_tokens=2048, temperature=0.8):
-    response = openai.ChatCompletion.create(
-        model=engine,
-        messages=[{"role": "system", "content": "You are a research assistant"},
-             {"role": "user", "content": question}],
-        max_tokens=max_tokens,
-        n=1,
-        temperature=temperature
-     )
-    return response.choices[0].message['content']
-def generate_answer(question):
-    topn_chunks = recommender(question)
-    prompt = "You are a helpful assistant.\n"
-    prompt += "User: " + question + "\n"
-    for c in topn_chunks:
-        prompt += "Assistant: " + c + "\n"
-    answer = generate_text(prompt)
     return answer
-def question_answer(urls, file, question):
-    if urls.strip() == '' and file is None:
-        return '[ERROR]: Both URLs and PDFs are empty. Provide at least one.'
-    paths = []
-    if urls.strip() != '':
-        urls = urls.split(',')  # split the URLs string into a list of URLs
-        for url in urls:
-            download_pdf(url.strip(), 'corpus.pdf')
-            paths.append('corpus.pdf')
-    if file is not None:
-        extract_zip(file.name)  # extract the PDFs from the zip file
-        for pdf_file in os.listdir('pdfs'):
-            paths.append(os.path.join('pdfs', pdf_file))
-    load_recommender(paths)
-    if question.strip() == '':
-        return '[ERROR]: Question field is empty'
-    return generate_answer(question)
-title = 'Cognitive AI Agent - Asks the Expert'
-description = """ This cognitive agent allows you to chat with your PDF files as a single corpus of knowledge.  Add your relevant PDFs to a zip file and upload. 🛑PROOF OF CONCEPT🛑 """
-iface = gr.Interface(
-    fn=question_answer,
-    inputs=[
-        gr.inputs.Textbox(label="Enter PDF URLs here, separated by commas"),
-        gr.inputs.File(label="Upload a zip file containing PDF files"),
-        gr.inputs.Textbox(label="Enter your question here"),
-    ],
-    outputs=gr.outputs.Textbox(label="Generated Answer"),
-    title=title,
-    description=description
-)
-iface.launch()

 import os
 import zipfile
+import openai
+import gradio as gr
+from gradio import components as grc
+# Set up OpenAI API credentials.
+# SECURITY: an API key must never be committed to source control. The key is
+# read from the environment variable "OpenAPI" (the same name the previous
+# revision of this file used). Any key that was ever hard-coded here has to be
+# treated as leaked and revoked.
+openai.api_key = os.getenv('OpenAPI')
+# Function to extract text from PDF using OpenAI API
+def extract_text_from_pdf(pdf_path):
+    """Send a PDF's raw content to the OpenAI completion API and return the reply.
+
+    Args:
+        pdf_path: Either a filesystem path to the PDF file, or the PDF
+            content itself as ``bytes``/``bytearray`` (which is what
+            ``ZipFile.read`` hands back). Bytes are always treated as
+            content, never as a path.
+
+    Returns:
+        The text of the first completion choice with surrounding whitespace
+        stripped.
+
+    NOTE(review): this function does not parse the PDF format; it forwards
+    the decoded file bytes as the prompt. A real PDF text extractor is
+    needed for meaningful results -- TODO confirm which parser to use.
+    """
+    if isinstance(pdf_path, (bytes, bytearray)):
+        pdf_bytes = bytes(pdf_path)
+    else:
+        with open(pdf_path, "rb") as f:
+            pdf_bytes = f.read()
+    # A strict UTF-8 decode raises UnicodeDecodeError on the first byte
+    # sequence that is not valid UTF-8; substitute undecodable bytes instead
+    # of crashing.
+    prompt = pdf_bytes.decode("utf-8", errors="replace")
+    response = openai.Completion.create(
+        engine="text-davinci-003",
+        prompt=prompt,
+        max_tokens=2048,
+        temperature=0.7,
+        n=1,
+        stop=None,
+        timeout=120,
+    )
+    return response.choices[0].text.strip()
+# Function to extract text from multiple PDFs in a ZIP archive
+def extract_text_from_zip(zip_file):
+    """Run ``extract_text_from_pdf`` on every ``.pdf`` member of a ZIP archive.
+
+    Args:
+        zip_file: Anything ``zipfile.ZipFile`` accepts as its first argument.
+
+    Returns:
+        The extracted texts concatenated in archive order, each followed by
+        a newline. An archive with no ``.pdf`` members yields ``""``.
+    """
+    import tempfile  # local import so this block does not depend on a file-level change
+
+    texts = []
+    with zipfile.ZipFile(zip_file, "r") as zip_ref:
+        for file_name in zip_ref.namelist():
+            if not file_name.endswith(".pdf"):
+                continue
+            # extract_text_from_pdf takes a path and open()s it, while
+            # zip_ref.read() returns the member's bytes. Spill the member to
+            # a temporary file and pass that path instead.
+            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+                tmp.write(zip_ref.read(file_name))
+            try:
+                texts.append(extract_text_from_pdf(tmp.name))
+            finally:
+                # delete=False above lets the file be reopened by name after
+                # it is closed, so it has to be removed by hand.
+                os.remove(tmp.name)
+    return "".join(text + "\n" for text in texts)
+# Function to split text into chunks based on maximum length
+def split_text_into_chunks(text, max_tokens=2048):
+    """Split ``text`` on whitespace into chunks of space-joined words.
+
+    Words are packed greedily; a chunk is closed as soon as adding the next
+    word would make it longer than ``max_tokens``.
+
+    Args:
+        text: The text to split.
+        max_tokens: Upper bound on a chunk's length. Despite the name this
+            is measured in characters (``len`` of the string), not in model
+            tokens.
+
+    Returns:
+        A list of non-empty chunk strings. A single word longer than
+        ``max_tokens`` becomes a chunk of its own (it is never cut), and
+        empty or whitespace-only input gives ``[]``.
+    """
     chunks = []
+    current_words = []
+    # Length of the current chunk counting one trailing space per word.
+    current_len = 0
+    for word in text.split():
+        if current_len + len(word) <= max_tokens:
+            current_words.append(word)
+            current_len += len(word) + 1
         else:
+            # Only flush a chunk that holds something: when the very first
+            # word is already over the limit there is nothing to flush, and
+            # appending here would produce an empty chunk.
+            if current_words:
+                chunks.append(" ".join(current_words))
+            current_words = [word]
+            current_len = len(word) + 1
+    if current_words:
+        chunks.append(" ".join(current_words))
+    return chunks
+# Function to process files and query using OpenAI API
+def process_files_and_query(zip_file, query):
+    """Answer ``query`` against the PDFs contained in an uploaded ZIP archive.
+
+    The upload is saved as ``uploaded.zip`` in the working directory, text is
+    pulled from it with ``extract_text_from_zip``, the text is cut up with
+    ``split_text_into_chunks``, and one completion request is made per chunk.
+
+    Args:
+        zip_file: The uploaded archive; must provide ``read()`` returning the
+            archive's bytes. ``None`` means nothing was uploaded.
+        query: The user's question.
+
+    Returns:
+        The per-chunk completions joined with single spaces, or an
+        ``[ERROR]: ...`` message when the upload or the query is missing.
+    """
+    # Validate before doing any work: every chunk costs one API request, so
+    # a blank query must not reach the loop below.
+    if zip_file is None:
+        return "[ERROR]: No ZIP file uploaded."
+    if not query or not query.strip():
+        return "[ERROR]: Question field is empty"
+    # Save uploaded ZIP file
+    zip_path = "uploaded.zip"
+    with open(zip_path, "wb") as f:
+        f.write(zip_file.read())
+    # Extract text from the saved copy. The upload object has already been
+    # read to its end above, and the copy would otherwise never be used.
+    corpus = extract_text_from_zip(zip_path)
+    # Split the corpus into chunks
+    chunks = split_text_into_chunks(corpus)
+    # Perform OpenAI API query on each chunk
+    responses = []
+    for chunk in chunks:
+        prompt = chunk + "\nQuery: " + query
+        response = openai.Completion.create(
+            engine="text-davinci-003",
+            prompt=prompt,
+            max_tokens=2048,
+            temperature=0.7,
+            n=1,
+            stop=None,
+            timeout=120,
+        )
+        responses.append(response.choices[0].text.strip())
+    # Combine the responses into a single answer
+    answer = " ".join(responses)
     return answer
+# Gradio widgets: a ZIP upload and a free-text query go in, one text box comes out
+zip_file_input = grc.File(label="Upload ZIP File")
+query_input = grc.Textbox(label="Enter your query")
+output = grc.Textbox(label="Answer")
+# Bind the widgets to process_files_and_query and start the app
+iface = gr.Interface(
+    fn=process_files_and_query,
+    inputs=[zip_file_input, query_input],
+    outputs=output,
+    title="PDF Search",
+    description="Upload a ZIP file containing PDFs, enter your query, and get the answer.",
+)
+iface.launch()