# NOTE(review): the three lines below are web-page/commit-log residue that was
# pasted above the code; kept as comments so the file remains valid Python.
# Digambar29's picture
# Initial clean commit with corrected .gitignore
# 4c86652
import os
import pdfplumber #type:ignore
from sentence_transformers import SentenceTransformer #type:ignore
import faiss #type:ignore
import numpy as np #type:ignore
from gpt4all import GPT4All #type:ignore
# -------------------------------
# Step 1: Load PDFs and extract text
# -------------------------------
papers_folder = "papers"  # folder where you store PDFs
pdf_texts = []
# sorted() makes the load order deterministic — os.listdir order is
# platform-dependent, which would shuffle chunk indices between runs.
for filename in sorted(os.listdir(papers_folder)):
    # Case-insensitive match also picks up ".PDF" files.
    if not filename.lower().endswith(".pdf"):
        continue
    path = os.path.join(papers_folder, filename)
    with pdfplumber.open(path) as pdf:
        # extract_text() may return None for image-only pages; skip those.
        # Join once instead of repeated `text +=` (quadratic on big PDFs).
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "".join(pt + "\n" for pt in page_texts if pt)
    pdf_texts.append(text)
print(f"Loaded {len(pdf_texts)} PDFs.")
# -------------------------------
# Step 2: Chunk texts (optional but recommended)
# -------------------------------
def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The input string; empty/whitespace-only input yields [].
        chunk_size: Maximum number of words per chunk (must be positive).

    Returns:
        A list of strings, each containing up to ``chunk_size`` words
        re-joined with single spaces.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
# Flatten every document's chunks into one list; the position of a chunk
# in this list is the id that the FAISS index will return later.
chunks = [piece for document in pdf_texts for piece in chunk_text(document)]
print(f"Total text chunks: {len(chunks)}")
# -------------------------------
# Step 3: Create embeddings with SentenceTransformers
# -------------------------------
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedding_model.encode(chunks, show_progress_bar=True)
# -------------------------------
# Step 4: Setup FAISS index
# -------------------------------
# Exact (non-approximate) L2 search over vectors of the model's output width.
vector_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(vector_dim)
vectors = np.array(embeddings)
index.add(vectors)
print("FAISS index created with all chunks.")
# -------------------------------
# Step 5: Load GPT4All offline model
# -------------------------------
# The model path is user-specific; allow overriding via environment variable
# while keeping the original hard-coded path as the backward-compatible default.
_DEFAULT_MODEL_PATH = r"C:\Users\ACER\Desktop\Llama-3.2-3B-Instruct-Q4_0.gguf"
gpt_model = GPT4All(os.environ.get("GPT4ALL_MODEL_PATH", _DEFAULT_MODEL_PATH))
print("GPT4All model loaded.")
# -------------------------------
# Step 6: Function to answer questions
# -------------------------------
def ask_question(query, top_k=3):
    """Answer *query* offline using retrieval-augmented generation.

    Embeds the query, retrieves the ``top_k`` nearest chunks from the FAISS
    index, and asks the GPT4All model to answer using only that context.

    Args:
        query: The user's question as a plain string.
        top_k: Number of chunks to retrieve as context.

    Returns:
        The model's generated answer string.
    """
    # Embed the query
    query_vec = embedding_model.encode([query])
    # Search FAISS for top_k relevant chunks
    distances, indices = index.search(np.array(query_vec), top_k)
    # FAISS pads `indices` with -1 when the index holds fewer than top_k
    # vectors; chunks[-1] would then silently inject the last chunk, so
    # keep only valid (non-negative) ids.
    context = "\n".join(chunks[i] for i in indices[0] if i >= 0)
    prompt = f"Answer the question using ONLY the information from the following context:\n{context}\n\nQuestion: {query}\nAnswer:"
    # Generate response
    response = gpt_model.generate(prompt, max_tokens=200, temp=0.7)
    return response
# -------------------------------
# Step 7: Interactive loop
# -------------------------------
print("\nResearch-Paper Chatbot (type 'exit' to quit)")
while True:
    try:
        question = input("\nYour question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session cleanly instead of a traceback.
        break
    if not question:
        # Skip blank input instead of running retrieval + generation on "".
        continue
    if question.lower() in ["exit", "quit"]:
        break
    answer = ask_question(question)
    print("\nAnswer:\n", answer)