Spaces:

dindizz
/

sabhascholar

Sleeping

App Files Files Community

dindizz committed on Sep 14, 2024

Commit

96a45e3

verified ·

1 Parent(s): e63b931

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -17

app.py CHANGED Viewed

@@ -1,41 +1,59 @@
 import openai
 import gradio as gr
-import pandas as pd
-import os  # Importing os to access environment variables
 from datasets import load_dataset
-# Load the dataset from Hugging Face
 # NOTE(review): the argument below is a Spaces URL (".../spaces/..."); the
 # post-change side of this diff passes 'dindizz/musicacademyarchives' instead.
-dataset = load_dataset('https://huggingface.co/spaces/dindizz/musicacademyarchives')
 # Access the OpenAI API key from environment variables (Hugging Face secret)
 # os.getenv returns None when OPENAI_API_KEY is unset; no check is made here.
 openai.api_key = os.getenv('OPENAI_API_KEY')
 def extract_info(query):
     """
     This function interacts with OpenAI GPT-3.5 Turbo to extract information from the dataset based on the user's query.
     """
     # `query` is interpolated verbatim into the prompt built below; the return
     # value is the model's reply text with surrounding whitespace stripped.
-    # Extracting the text content from the dataset to pass as context
-    all_souvenirs = []
     # Walks the module-level `dataset` on every call; nothing is cached
     # between queries.
     for item in dataset['train']:
-        souvenir_text = item['text']  # Assuming the column name is 'text' containing the content
-        all_souvenirs.append(souvenir_text)
-    # Combine the content into a single string (you can adjust based on the size of the dataset)
-    combined_souvenir_text = "\n".join(all_souvenirs)
-    # Prompt OpenAI GPT-3.5 with the user's query and the combined text
     # Only the first 2000 characters of the joined text are placed in the
     # prompt; everything after that is discarded.
-    prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {combined_souvenir_text[:2000]}"  # limiting the length for performance
     response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",  # Updated model
         messages=[
-            {"role": "system", "content": "You are an assistant that extracts information from the Madras Music Academy Souvenir dataset and present in a friendly tone ."},
             {"role": "user", "content": prompt}
         ],
         max_tokens=300
     )
-    # Returning the answer from OpenAI GPT-3.5 Turbo
     # Only the first entry of `choices` is read.
     answer = response['choices'][0]['message']['content']
     return answer.strip()
@@ -49,7 +67,7 @@ iface = gr.Interface(
     inputs="text",
     outputs="text",
     title="Sabha Scholar - Madras Music Academy AI Explorer",
-    description="Ask questions about the Madras Music Academy Souvenirs and extract information using OpenAI GPT-3.5 Turbo."
 )
 iface.launch()

 import openai
 import gradio as gr
+import os
 from datasets import load_dataset
+from pdf2image import convert_from_path
+import pytesseract
+from PIL import Image
 # Access the OpenAI API key from environment variables (Hugging Face secret)
 # os.getenv returns None when OPENAI_API_KEY is unset; no check is made here.
 openai.api_key = os.getenv('OPENAI_API_KEY')
# Function to convert PDF to images and apply OCR
def pdf_to_text(pdf_path) -> str:
    """
    Convert the pages of a PDF to images and return the text found by OCR.

    Args:
        pdf_path: Location of the PDF; passed unchanged to
            ``convert_from_path``.

    Returns:
        The OCR text of every rendered page, concatenated in the order the
        images are returned, with a newline appended after each page's text.
        An empty string when no images are produced.
    """
    page_images = convert_from_path(pdf_path)
    # A single join avoids rebuilding the growing string on every page, which
    # repeated `+=` would do.
    return "".join(
        pytesseract.image_to_string(page_image) + "\n"
        for page_image in page_images
    )
# Load the dataset from Hugging Face (adjust to point to your dataset)
dataset = load_dataset('dindizz/musicacademyarchives')

# Number of characters of archive text that are placed in the prompt.
_CONTEXT_CHAR_LIMIT = 2000

# Archive text already produced by OCR, keyed by character limit. Filled on
# first use so later queries do not repeat OCR over the same PDFs.
_context_cache = {}


def _archive_context(limit=_CONTEXT_CHAR_LIMIT):
    """
    Return the first ``limit`` characters of the newline-joined OCR text.

    PDFs are processed in dataset order and processing stops as soon as enough
    text has been collected, because anything past ``limit`` characters never
    reaches the prompt. The result is remembered for subsequent calls.
    """
    cached = _context_cache.get(limit)
    if cached is not None:
        return cached

    texts = []
    collected = 0
    for item in dataset['train']:
        pdf_path = item['file']  # Adjust based on the dataset structure
        pdf_text = pdf_to_text(pdf_path)
        texts.append(pdf_text)
        # +1 accounts for the "\n" that join() puts between documents.
        collected += len(pdf_text) + 1
        if collected > limit:
            # The joined text is already at least `limit` characters long, so
            # further PDFs cannot change the truncated result.
            break

    context = "\n".join(texts)[:limit]
    _context_cache[limit] = context
    return context


def extract_info(query):
    """
    Answer a question about the archive using OpenAI GPT-3.5 Turbo.

    Args:
        query: The user's question; it is interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or a short
        request for input when ``query`` is empty or only whitespace.

    Raises:
        Whatever ``pdf_to_text`` or the OpenAI client raise; nothing is caught
        here.
    """
    # Avoid OCR work and an API call when there is nothing to ask.
    if not query or not query.strip():
        return "Please enter a question about the Madras Music Academy Souvenirs."

    combined_text = _archive_context()
    # Send combined text and query to OpenAI for extraction
    prompt = f"Extract relevant information based on the following query: '{query}' from the Madras Music Academy Souvenir archives: {combined_text}"
    # NOTE(review): openai.ChatCompletion is the pre-1.0 client interface;
    # confirm the installed openai version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that extracts information from PDF files using OCR."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300
    )
    # Return the answer from OpenAI GPT-3.5
    answer = response['choices'][0]['message']['content']
    return answer.strip()
     inputs="text",
     outputs="text",
     title="Sabha Scholar - Madras Music Academy AI Explorer",
+    description="Ask questions about the Madras Music Academy Souvenirs. Extract information using OCR and OpenAI GPT-3.5 Turbo."
 )
 iface.launch()