# Peer Skill Matcher — Streamlit app (originally hosted as a Hugging Face Space).
# (The original file began with scraped Space status text: "Spaces: / Sleeping / Sleeping",
# which is not valid Python and has been converted to this comment.)
| import os | |
| import PyPDF2 | |
| from transformers import AutoTokenizer, AutoModel | |
| import torch | |
| import faiss | |
| import numpy as np | |
| import streamlit as st | |
| from io import BytesIO | |
| import tempfile | |
| from groq import Groq | |
| import os | |
# Groq API setup.
# SECURITY: an API key was previously hard-coded here (and therefore committed
# to source control) — that key is compromised and must be rotated. Read the
# key from the environment instead of embedding it in the code.
groq_api = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=groq_api)
| # Function to extract text from PDF | |
def extract_text_from_pdf(pdf_path):
    """Extract the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A single string containing the text of all pages joined together
        (empty string for pages with no extractable text).
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # BUG FIX: extract_text() may return None for pages with no
        # extractable text (e.g. scanned images); guard with `or ''` so
        # the concatenation cannot raise TypeError. join() also avoids
        # the quadratic string `+=` of the original loop.
        return ''.join((page.extract_text() or '') for page in reader.pages)
| # Chunking function | |
def chunk_text(text, chunk_size=500):
    """Split *text* into consecutive fixed-size chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum characters per chunk (default 500).

    Returns:
        A list of substrings; every chunk except possibly the last has
        exactly ``chunk_size`` characters. Empty input yields ``[]``.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
| # Tokenizing function | |
def tokenize_text(chunks, model_name="bert-base-uncased"):
    """Tokenize each text chunk with a pretrained Hugging Face tokenizer.

    Args:
        chunks: Iterable of text strings to tokenize.
        model_name: Hugging Face model id whose tokenizer to load.

    Returns:
        A list of ``input_ids`` tensors, one per chunk.
        NOTE(review): these are token-id tensors, not embeddings — the
        original code named them misleadingly.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return [
        tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)["input_ids"]
        for chunk in chunks
    ]
| # Create embeddings from text using a pre-trained model | |
def create_embeddings(text_chunks, model_name="bert-base-uncased"):
    """Embed each text chunk with a pretrained transformer.

    Each chunk is tokenized, run through the model, and mean-pooled over
    the token dimension to yield one fixed-size vector per chunk.

    Args:
        text_chunks: Iterable of text strings to embed.
        model_name: Hugging Face model id to load (default BERT base).

    Returns:
        A float32 numpy array of shape ``(n_chunks, hidden_size)`` —
        2-D, as required by the FAISS index built downstream.
    """
    # NOTE(review): the model and tokenizer are reloaded on every call;
    # in a long-running app these should be cached (e.g. st.cache_resource).
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    vectors = []
    for chunk in text_chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool over the token axis -> shape (1, hidden_size).
        vectors.append(outputs.last_hidden_state.mean(dim=1).numpy())
    # BUG FIX: np.array() over a list of (1, hidden) arrays produced shape
    # (n, 1, hidden), so create_faiss_index saw shape[1] == 1 and built a
    # 1-dimensional index. vstack yields the intended (n, hidden) matrix,
    # and FAISS requires float32.
    return np.vstack(vectors).astype(np.float32)
| # FAISS Indexing | |
def create_faiss_index(embeddings):
    """Build a flat L2 (exact nearest-neighbor) FAISS index over embeddings.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dim)``.

    Returns:
        A ``faiss.IndexFlatL2`` populated with the vectors.
    """
    # FAISS requires a C-contiguous float32 matrix; convert defensively so
    # callers may pass float64 or non-contiguous arrays without crashing.
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index
| # Function to generate response with Groq API | |
def generate_response_with_groq(prompt):
    """Send *prompt* to the Groq chat-completion API and return the reply.

    Args:
        prompt: The user message to send as a single-turn conversation.

    Returns:
        The content string of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        model="llama3-8b-8192",  # Example model, change if needed
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
# --- Streamlit App ---------------------------------------------------------
st.title("Peer Skill Matcher App")
uploaded_file = st.file_uploader("Upload a PDF Document", type="pdf")
if uploaded_file is not None:
    # Persist the upload to disk because extract_text_from_pdf reads a path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name
    try:
        # Extract text from the uploaded PDF.
        text = extract_text_from_pdf(tmp_file_path)
    finally:
        # BUG FIX: the temp file was never removed, leaking one file per upload.
        os.remove(tmp_file_path)
    st.text_area("Extracted Text", text, height=300)
    # Chunk the text and embed each chunk for similarity search.
    text_chunks = chunk_text(text)
    embeddings = create_embeddings(text_chunks)
    faiss_index = create_faiss_index(embeddings)
    st.success("Text processed successfully. Now you can search for peer skills.")
    # User input for search query.
    query = st.text_input("Search for Peer Skills", "")
    if query:
        # Embed the query, then normalize to the C-contiguous (1, dim)
        # float32 matrix that faiss_index.search() requires (the raw
        # result may carry an extra leading axis).
        query_embedding = create_embeddings([query])
        query_matrix = np.ascontiguousarray(
            np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
        )
        # BUG FIX: never ask FAISS for more neighbors than were indexed.
        k = min(3, len(text_chunks))
        distances, indices = faiss_index.search(query_matrix, k=k)
        st.write(f"Top {k} Matching Skills:")
        for idx, dist in zip(indices[0], distances[0]):
            # BUG FIX: originally only the chunk index and distance were
            # shown; display the matching chunk text so results are usable.
            st.write(f"Match {idx} (distance {dist:.4f}): {text_chunks[idx]}")