# Hugging Face Space: ommore86/Legal-Docs-AI (status: Sleeping)
# File size: 1,461 bytes; commit b4bcb01

import os
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import PyPDF2

# The Hugging Face API token is a secret and must never be committed to source
# control. Supply it through the HUGGINGFACEHUB_API_TOKEN environment variable
# (on Hugging Face Spaces: Settings -> Variables and secrets).
# SECURITY: a token was previously hard-coded at this point in the file; treat
# it as leaked and revoke it at https://huggingface.co/settings/tokens.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    # Fail before any PDF parsing or model loading happens, with a message
    # that names the variable to set.
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export a Hugging Face API token "
        "in the environment before running this script."
    )

# Load and split PDF
def extract_text_from_pdf(file_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string holding each page's extracted text back to back, with no
        separator inserted between pages.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract inside the ``with`` block so the reader's underlying file is
        # still open. ``or ''`` lets a page whose extraction yields None or an
        # empty string contribute nothing instead of breaking the join.
        return ''.join(page.extract_text() or '' for page in reader.pages)

# Pull the raw text out of the source document.
pdf_text = extract_text_from_pdf("IEEEpaper.pdf")

# Split the text into overlapping pieces (chunk_size=1000, chunk_overlap=200;
# the splitter measures length in characters by default) so each piece can be
# embedded and retrieved on its own.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(pdf_text)

# A PDF with no extractable text (for example, scanned images) produces no
# chunks. Stop here with a clear message rather than letting the empty list
# fail later, during index construction, with a far less helpful error.
if not chunks:
    raise ValueError(
        "No text could be extracted from 'IEEEpaper.pdf'; nothing to index."
    )

from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used to turn every chunk into a vector.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# FAISS index over the embedded chunks; the QA chain's retriever is created
# from this store.
vectorstore = FAISS.from_texts(texts=chunks, embedding=embedding)

# Text-generation endpoint for the Zephyr model, authenticated with the token
# read from the environment.
hf_llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-alpha",
    temperature=0.5,
    max_new_tokens=512,
    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
)

# Wrap the endpoint in the chat-model interface.
llm = ChatHuggingFace(llm=hf_llm)

# Question-answering chain that retrieves relevant chunks from the vector
# store and passes them to the LLM.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

# ``Chain.run`` is deprecated; ``invoke`` takes the question under the chain's
# "query" key and returns a dict whose "result" entry is the answer text.
response = qa.invoke({"query": "Who is the author?"})["result"]

# Show the answer; without this the script produces no visible output.
print(response)

# print("Total Chunks:", len(chunks))
# print("First Chunk:", chunks[0] if chunks else "No chunks extracted")