Spaces:

ommore86
/

Legal-Docs-AI

Sleeping

Legal-Docs-AI / pdf_extractor.py

initial commit

b4bcb01 5 months ago

1.46 kB

	import os
	from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
	from langchain_community.vectorstores import FAISS
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.chains import RetrievalQA
	import PyPDF2

	# The Hugging Face API token must be supplied through the environment (for
	# example a Space secret or an exported shell variable); it is deliberately
	# not stored in source control.
	# NOTE(review): a token was previously hard-coded on this line and has been
	# published with the repository, so it should be revoked and replaced.
	if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
	    raise RuntimeError(
	        "HUGGINGFACEHUB_API_TOKEN is not set; export your Hugging Face API "
	        "token in the environment before running this script."
	    )

	# Load and split PDF
	def extract_text_from_pdf(file_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file_path: Path of the PDF file to read; it is opened in binary mode.

	    Returns:
	        One string holding the extracted text of all pages, with no
	        separator inserted between pages. A page that yields no text
	        contributes an empty string.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    with open(file_path, 'rb') as file:
	        reader = PyPDF2.PdfReader(file)
	        # Extraction stays inside the `with` block so the file is still open
	        # while the pages are read. `or ""` is defensive: a page whose
	        # extraction yields None (or nothing) must not break the join.
	        return "".join(page.extract_text() or "" for page in reader.pages)

	from langchain_huggingface import HuggingFaceEmbeddings

	# PDF that is loaded, chunked and indexed for question answering.
	PDF_PATH = "IEEEpaper.pdf"

	pdf_text = extract_text_from_pdf(PDF_PATH)

	# Overlapping chunks (chunk_size=1000, chunk_overlap=200) so that text cut
	# at a chunk boundary still appears together in a neighbouring chunk.
	splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
	chunks = splitter.split_text(pdf_text)

	# Fail early with a clear message: an empty chunk list (e.g. a PDF with no
	# extractable text) would otherwise surface as an opaque error from inside
	# the vector store construction below.
	if not chunks:
	    raise ValueError(
	        f"No text could be extracted from {PDF_PATH!r}; nothing to index."
	    )

	# Embed every chunk and build the FAISS index used for retrieval.
	embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
	vectorstore = FAISS.from_texts(chunks, embedding)

	# Endpoint for the Zephyr 7B alpha model; the API token is read from the
	# environment, and generation is capped at 512 new tokens at temperature 0.5.
	hf_llm = HuggingFaceEndpoint(
	    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
	    repo_id="HuggingFaceH4/zephyr-7b-alpha",
	    max_new_tokens=512,
	    temperature=0.5,
	)

	# Chat-model wrapper around the endpoint; this is the object handed to the
	# retrieval QA chain.
	llm = ChatHuggingFace(llm=hf_llm)

	# Retrieval QA chain: the vector store's retriever supplies context chunks
	# and the chat model produces the answer.
	qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

	# `Chain.run` is deprecated in LangChain; `invoke` takes the question under
	# the "query" key and returns a dict whose "result" entry is the answer text,
	# so `response` remains a plain string.
	response = qa.invoke({"query": "Who is the author?"})["result"]
	# NOTE(review): `response` is not printed or used further in the visible
	# code — confirm who is expected to consume it.

	# print("Total Chunks:", len(chunks))
	# print("First Chunk:", chunks[0] if chunks else "No chunks extracted")