# Legal Research Assistant — TF-IDF relevance matching over a PDF corpus (Gradio app).
# Third-party requirements (install with: pip install gradio PyPDF2 nltk scikit-learn)
import os

import gradio as gr
import nltk
import PyPDF2  # PDF text extraction
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Download the NLTK resources required by word_tokenize ('punkt') and the
# stopword filter ('stopwords'). Downloads are cached locally, so repeated
# startups are cheap; quiet=True suppresses the progress chatter each run.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Function to extract text from PDFs using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Uses the current PyPDF2 API (``PdfReader`` / ``.pages`` /
    ``.extract_text()``). The legacy names used before —
    ``PdfFileReader``, ``getNumPages``, ``getPage``, ``extractText`` —
    were removed in PyPDF2 3.0 and raise errors there, which is the
    likely cause of the Space's runtime error.

    Parameters:
        pdf_path: filesystem path to a PDF file.

    Returns:
        All extractable page text joined into one string (may be "" for
        scanned/image-only PDFs with no text layer).
    """
    page_texts = []
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # extract_text() may return None for pages without a text layer.
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)
# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Lowercase and tokenize *text*, dropping non-alphanumeric tokens and
    English stopwords; return the surviving tokens space-joined.

    Parameters:
        text: raw document text.

    Returns:
        A single space-separated string of cleaned tokens ("" if nothing
        survives the filter).
    """
    # Build the stopword set ONCE. The previous code called
    # stopwords.words('english') inside the filter, re-materializing the
    # full stopword list and doing an O(m) list scan for every token.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    kept = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(kept)
# Function to preprocess the documents in the specified directory
def preprocess_documents(dataset_dir):
    """Extract and clean the text of every PDF in *dataset_dir*.

    Parameters:
        dataset_dir: path to a directory containing the reference corpus.

    Returns:
        A list of cleaned document strings, one per PDF, in sorted
        filename order. Sorting makes the "Document N" indices shown to
        the user deterministic (os.listdir order is arbitrary), and the
        lowercase suffix check also picks up '.PDF' files.
    """
    documents = []
    for filename in sorted(os.listdir(dataset_dir)):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(dataset_dir, filename)
            pdf_text = extract_text_from_pdf(pdf_path)
            documents.append(clean_and_tokenize(pdf_text))
    return documents
# Function to perform relevance matching and return top N documents
def perform_relevance_matching(query, uploaded_files, dataset_dir, top_n=5):
    """Rank the PDFs in *dataset_dir* by TF-IDF cosine similarity to the
    query and any uploaded files, returning the top matches for display.

    Parameters:
        query: free-text query string.
        uploaded_files: list of uploaded file objects (each exposing a
            ``.name`` path, as Gradio provides), or None/empty.
        dataset_dir: directory containing the reference PDF corpus.
        top_n: number of results to return (default 5).

    Returns:
        A list of (label, excerpt) rows, where excerpt is the first 500
        characters of the matched document; [] if the corpus is empty.

    NOTE(review): the previous signature ``(query, *uploaded_files,
    dataset_dir)`` made dataset_dir keyword-only, so Gradio's positional
    call ``fn(query, files, dataset_dir)`` raised TypeError on every
    submit. The parameters now line up with the three Interface inputs.
    """
    # Preprocess the documents in the specified dataset directory.
    documents = preprocess_documents(dataset_dir)
    if not documents:
        # Nothing to rank; also avoids fitting TF-IDF on an empty corpus.
        return []

    # Extract text from each user-uploaded file (tolerate no uploads).
    uploaded_documents = []
    for file in uploaded_files or []:
        uploaded_documents.append(extract_text_from_pdf(file.name))

    # Combine the uploaded documents and the query into the "probe" set.
    combined_documents = uploaded_documents + [query]
    n_combined = len(combined_documents)

    # Vectorize corpus + probes together so they share one vocabulary.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents + combined_documents)

    # Similarity of every probe row against every corpus document.
    cosine_similarities = cosine_similarity(
        tfidf_matrix[-n_combined:], tfidf_matrix[:-n_combined]
    )
    # Average over ALL probe rows. The old code used only row 0, which is
    # the FIRST uploaded file — the query itself was ignored whenever any
    # file was uploaded.
    scores = cosine_similarities.mean(axis=0)

    # Rank corpus documents by descending similarity and keep the top N.
    ranked = sorted(enumerate(scores), key=lambda item: item[1], reverse=True)
    top_documents = []
    for doc_index, score in ranked[:top_n]:
        excerpt = documents[doc_index][:500]  # first 500 chars as preview
        top_documents.append(
            (f"Document {doc_index + 1} (Similarity Score: {score:.4f})", excerpt)
        )
    return top_documents
# Create the Gradio interface. The three inputs map positionally onto
# perform_relevance_matching(query, uploaded_files, dataset_dir).
iface = gr.Interface(
    fn=perform_relevance_matching,
    inputs=[
        gr.Textbox(label="Query"),
        # file_count="multiple" is the supported way to accept several
        # files; gr.File(multiple=True) is not a valid parameter and
        # raises TypeError in Gradio 3+.
        gr.File(file_count="multiple", label="Upload PDFs"),
        gr.Textbox(label="Dataset directory"),
    ],
    # gr.Dataframe is Gradio's tabular output component; gr.Table does
    # not exist in the gradio namespace. live=True was dropped because it
    # re-ran the full TF-IDF pipeline on every keystroke; results now
    # compute on submit.
    outputs=gr.Dataframe(headers=["Document", "Excerpt"]),
    title="Legal Research Assistant",
    description="Enter your legal query, upload files, and specify the dataset directory.",
)

# Launch only when executed as a script, so the module stays importable.
if __name__ == "__main__":
    iface.launch()