Spaces:

Sk4467
/

fastapiapp

Sleeping

App Files Files Community

fastapiapp / file_processing.py

Sk4467

Update file_processing.py

3c4744f verified almost 2 years ago

raw

history blame contribute delete

3.92 kB

	from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, DirectoryLoader
	from langchain.embeddings import SentenceTransformerEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.vectorstores import Chroma
	from os.path import join
	import os
	from dotenv import load_dotenv
	# load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
	openai_api_key = os.environ.get('OPENAI_API_KEY')
	from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
	from langchain_community.document_loaders import PyMuPDFLoader,TextLoader,CSVLoader,Docx2txtLoader,UnstructuredWordDocumentLoader

	# def load_documents(file_path):
	# if file_path.endswith('.txt'):
	# loader = TextLoader(file_path)
	# elif file_path.endswith('.pdf'):
	# loader = PyPDFLoader(file_path)
	# elif file_path.endswith('.doc') or file_path.endswith('.docx'):
	# loader = UnstructuredWordDocumentLoader(file_path)
	# elif file_path.endswith('.csv'):
	# loader = CSVLoader(file_path)
	# else:
	# raise ValueError(f"Unsupported file format: {file_path}")

	# documents = loader.load()
	# return documents
	from fastapi import UploadFile
	from typing import List
	import fitz # PyMuPDF
	import pandas as pd
	import docx
	import tempfile
	from langchain.docstore.document import Document

	def read_pdf(file_path: str) -> str:
	loader=PyMuPDFLoader(file_path)
	text=loader.load()
	return text

	def read_docx(file_path: str) -> str:
	loader=UnstructuredWordDocumentLoader(file_path)
	text=loader.load()
	return text

	def read_csv(file_path: str) -> str:
	loader=CSVLoader(file_path)
	data=loader.load()
	return data
	def read_txt(file_path: str) -> str:
	loader=TextLoader(file_path)
	text=loader.load()
	return text

	async def load_documents(file: UploadFile) -> List[Document]:
	temp_file_path = f"temp_{file.filename}"
	try:
	# Save the uploaded file to a temporary file
	with open(temp_file_path, "wb") as temp_file:
	contents = await file.read() # Read the content of the uploaded file
	temp_file.write(contents) # Write the content to the temporary file

	# Now you can pass temp_file_path to your read functions
	content = ""
	if file.filename.endswith('.pdf'):
	content = read_pdf(temp_file_path) # Pass the path, not the file object
	elif file.filename.endswith('.docx'):
	content = read_docx(temp_file_path)
	elif file.filename.endswith('.csv'):
	content = read_csv(temp_file_path)
	elif file.filename.endswith('.txt'):
	content = read_txt(temp_file_path)
	else:
	raise ValueError("Unsupported file format")
	except Exception as e:
	print(f"Error processing document: {e}")
	content = "Error processing document."
	finally:
	if os.path.exists(temp_file_path):
	os.remove(temp_file_path) # Clean up the temporary file

	# metadata = {'source': file.filename}
	# document = Document(page_content=content, metadata=metadata)
	return content



	from langchain.text_splitter import CharacterTextSplitter

	def chunk_documents(documents, chunk_size, chunk_overlap):
	text_splitter = CharacterTextSplitter(
	chunk_size=chunk_size,
	chunk_overlap=chunk_overlap
	)
	chunked_docs = text_splitter.split_documents(documents)
	return chunked_docs


	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import Chroma

	def create_embeddings(chunked_docs, collection_name):
	embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
	vector_store = Chroma.from_documents(chunked_docs, embeddings, collection_name=collection_name)
	vector_store.persist()

	return vector_store