from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import os
from dotenv import load_dotenv
load_dotenv()
import tempfile
import requests
import streamlit as st
import boto3
from botocore.exceptions import ClientError, NoCredentialsError
def check_pdf(read_file_path):
    """Parse the PDF at *read_file_path* with LlamaParse and report the outcome in Streamlit.

    Shows ``st.success`` when at least one document was parsed, ``st.error``
    when parsing produced no documents or raised any exception.
    """
    try:
        parser = LlamaParse(result_type="markdown", api_key=os.environ['LLAMA_CLOUD_API_KEY'], ignore_errors=False)
        file_extractor = {".pdf": parser}
        markdown_data = SimpleDirectoryReader(input_files=[read_file_path], file_extractor=file_extractor).load_data()
        # Pythonic emptiness test instead of comparing against a literal [].
        if not markdown_data:
            st.error('No markdown data found')
        else:
            st.success('File Parsed successfully')
    except Exception as e:
        # Broad catch is acceptable here: this is the UI boundary and the
        # error is surfaced to the user rather than swallowed.
        st.error(f"An error occurred: {e}")
def download_file_from_url(url, filename):
    """Download *url* to local path *filename*, streaming in 1 KiB chunks.

    Progress and errors are reported through Streamlit widgets; nothing is
    returned. On a non-200 status the file is not written.
    """
    # NOTE(review): the scraped source showed "(unknown)" here — restored the
    # {filename} placeholder the f-string clearly intended.
    st.markdown(f"Downloading file from {url} to {filename}")
    # Guard: os.path.dirname() is '' for a bare filename, and makedirs('')
    # raises FileNotFoundError.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # A timeout prevents the Streamlit worker from hanging forever on an
    # unresponsive host; streaming avoids loading the whole body into memory.
    response = requests.get(url, stream=True, timeout=30)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
        st.markdown(f"File downloaded and saved as {filename}")
    else:
        st.error(f"Failed to download file. Status code: {response.status_code}")
# --- URL-based flow: fetch a PDF from a user-supplied URL and parse it. ---
url = st.text_input("Enter URL", key="url")
if url:
    with tempfile.TemporaryDirectory() as temp_dir:
        # Build the destination path once and reuse it for both steps.
        pdf_path = os.path.join(temp_dir, "task_for_you.pdf")
        download_file_from_url(url, pdf_path)
        check_pdf(pdf_path)
def download_files_from_s3(bucket_name, local_folder, file_path_list):
    """Download the S3 objects named in *file_path_list* from *bucket_name* into *local_folder*.

    Lists the whole bucket (empty prefix) and downloads only the keys that
    appear in *file_path_list*, then verifies that every requested file now
    exists locally. All progress and errors are reported via Streamlit.
    """
    s3 = boto3.client('s3')
    folder_prefix = ''
    # Build the membership set once: testing each listed object against a
    # list is O(len(file_path_list)) per object over a possibly large bucket.
    wanted = set(file_path_list)
    try:
        # Paginate so buckets with more than 1000 objects are fully scanned.
        paginator = s3.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix)
        for page in page_iterator:
            for obj in page.get('Contents', []):
                key = obj['Key']
                # Skip objects the caller did not ask for.
                if key not in wanted:
                    continue
                local_path = os.path.join(local_folder, key)
                # Keys may contain '/' separators; create the subdirectories.
                os.makedirs(os.path.dirname(local_path), exist_ok=True)
                try:
                    st.markdown(f"Downloading: {key} -> {local_path}")
                    s3.download_file(bucket_name, key, local_path)
                    st.markdown(f"Downloaded: {local_path}")
                except Exception as e:
                    # Per-object failures are reported but do not abort the
                    # remaining downloads.
                    st.error(f"Error downloading {key}: {e}")
        # Post-check: report any requested file that never landed on disk
        # (e.g. key absent from the bucket entirely).
        for path in file_path_list:
            if not os.path.isfile(os.path.join(local_folder, path)):
                st.error(f"Failed to download file {path}")
    except NoCredentialsError:
        st.error("No AWS credentials found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")
# --- S3-based flow: fetch a single key from a bucket and parse it. ---
bucket_name = st.text_input("Enter bucket name", key="bucket_name")
key = st.text_input("Enter key", key="key")
if st.button("Submit"):
    with tempfile.TemporaryDirectory() as temp_dir:
        download_files_from_s3(bucket_name, temp_dir, [key])
        file_name = os.path.join(temp_dir, key)
        # Bug fix: file_name is already rooted at temp_dir; the original
        # os.path.join(temp_dir, file_name) re-joined it and only worked by
        # accident because join discards the first argument when the second
        # is absolute.
        check_pdf(file_name)