Legal-Docs-AI / pdf_extractor.py
ommore86's picture
initial commit
b4bcb01
import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import PyPDF2
# Set your Hugging Face API token
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"
# Load and split PDF
def extract_text_from_pdf(file_path):
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
text = ''
for page in reader.pages:
text += page.extract_text()
return text
pdf_text = extract_text_from_pdf("IEEEpaper.pdf")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(pdf_text)
from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_texts(chunks, embedding)
# ✅ Use HuggingFaceEndpoint correctly
hf_llm = HuggingFaceEndpoint(
repo_id="HuggingFaceH4/zephyr-7b-alpha",
temperature=0.5,
max_new_tokens=512,
huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
)
llm = ChatHuggingFace(llm=hf_llm)
qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())
response = qa.run("Who is the author?")
# print("Total Chunks:", len(chunks))
# print("First Chunk:", chunks[0] if chunks else "No chunks extracted")