# PEERSKILLMATCH / app.py
# (Hugging Face Spaces page residue removed: author SumbalFatima1122,
#  commit "Update app.py", 2e7c243 verified — kept here as a comment so
#  the file is valid Python.)
import os
import PyPDF2
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import streamlit as st
from io import BytesIO
import tempfile
from groq import Groq

# SECURITY FIX: the Groq API key was hard-coded (twice) in source control.
# Read it from the environment instead (set GROQ_API_KEY in the Space's
# secrets). The previously committed key must be considered leaked and revoked.
groq_api = os.environ.get("GROQ_API_KEY")
if not groq_api:
    raise RuntimeError("GROQ_API_KEY environment variable is not set")

# Groq API client shared by generate_response_with_groq below.
client = Groq(api_key=groq_api)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages without an extractable text layer (e.g. scanned images) contribute
    an empty string instead of crashing the extraction.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # BUG FIX: PdfReader.extract_text() can return None for image-only
        # pages; the original `text += page.extract_text()` raised TypeError
        # in that case. `or ''` guards it, and join avoids quadratic concat.
        return ''.join(page.extract_text() or '' for page in reader.pages)
# Chunking function
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    The final chunk may be shorter than *chunk_size*; an empty string yields
    an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces
# Tokenizing function
def tokenize_text(chunks, model_name="bert-base-uncased"):
    """Tokenize each text chunk with a Hugging Face tokenizer.

    NOTE(review): despite appearances this returns the ``input_ids`` tensors
    (token IDs), not embeddings — the actual embedding step lives in
    ``create_embeddings``. Kept as-is for interface compatibility.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return [
        tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)["input_ids"]
        for chunk in chunks
    ]
# Create embeddings from text using a pre-trained model
def create_embeddings(text_chunks, model_name="bert-base-uncased"):
    """Embed each text chunk via mean-pooled BERT last hidden states.

    Returns a 2D float32 array of shape ``(len(text_chunks), hidden_size)``,
    ready to be added to / searched against a FAISS index.
    """
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    embeddings = []
    for chunk in text_chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool over the sequence dimension -> shape (1, hidden_size).
        embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy())
    # BUG FIX: np.array over a list of (1, H) arrays produced a 3D (n, 1, H)
    # array, so create_faiss_index saw shape[1] == 1 and FAISS add/search
    # broke. vstack yields the intended (n, H) matrix; FAISS needs float32.
    return np.vstack(embeddings).astype(np.float32)
# FAISS Indexing
def create_faiss_index(embeddings):
    """Build an exact-L2 FAISS index over *embeddings*.

    *embeddings* must be a 2D array with one vector per row; it is coerced
    to C-contiguous float32, which is what FAISS requires.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
# Function to generate response with Groq API
def generate_response_with_groq(prompt):
    """Send *prompt* as a single user message to the Groq chat API and
    return the assistant's reply text."""
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",  # Example model, change if needed
    )
    return completion.choices[0].message.content
# Streamlit App
# Streamlit App
st.title("Peer Skill Matcher App")

uploaded_file = st.file_uploader("Upload a PDF Document", type="pdf")

if uploaded_file is not None:
    # Persist the upload to a temp file so PyPDF2 can open it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name
    try:
        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(tmp_file_path)
    finally:
        # BUG FIX: delete=False leaked a temp file per upload; remove it.
        os.unlink(tmp_file_path)

    st.text_area("Extracted Text", text, height=300)

    # Chunk, embed, and index the document text.
    text_chunks = chunk_text(text)
    embeddings = create_embeddings(text_chunks)
    faiss_index = create_faiss_index(embeddings)
    st.success("Text processed successfully. Now you can search for peer skills.")

    # User input for search query
    query = st.text_input("Search for Peer Skills", "")
    if query:
        # Embed the query the same way as the chunks; FAISS search expects a
        # C-contiguous 2D float32 array of shape (n_queries, dim).
        query_embedding = np.ascontiguousarray(
            create_embeddings([query]), dtype=np.float32
        )
        # BUG FIX: never request more neighbours than there are indexed chunks.
        k = min(3, len(text_chunks))
        distances, indices = faiss_index.search(query_embedding, k=k)

        st.write("Top 3 Matching Skills:")
        for idx, dist in zip(indices[0], distances[0]):
            st.write(f"Match {idx}: Distance = {dist:.4f}")
            # Show the matched chunk itself — an index alone is not useful.
            st.write(text_chunks[idx])