# NOTE(review): the original capture began with "Spaces / Sleeping / Sleeping" —
# Hugging Face Spaces page chrome, not part of the source. Replaced with this
# comment so the file parses as Python.
| import PyPDF2 | |
| import json | |
| import networkx as nx | |
| from sentence_transformers import SentenceTransformer, util | |
| import openai | |
| # Model for embeddings | |
| model = SentenceTransformer("all-MiniLM-L6-v2") | |
| # 1. Extract Text from PDF | |
| def extract_text_from_pdf(file_path): | |
| """Extract text from a PDF.""" | |
| text = "" | |
| with open(file_path, "rb") as f: | |
| reader = PyPDF2.PdfReader(f) | |
| for page in reader.pages: | |
| text += page.extract_text() | |
| return text | |
| # 2. Build Hierarchical Tree | |
| def build_hierarchical_tree(text, textbook_title): | |
| """Create a hierarchical tree structure.""" | |
| lines = text.split("\n") | |
| tree = {"title": textbook_title, "chapters": []} | |
| current_chapter = None | |
| for line in lines: | |
| if line.strip().startswith("Chapter"): | |
| current_chapter = {"title": line.strip(), "sections": []} | |
| tree["chapters"].append(current_chapter) | |
| elif current_chapter and line.strip(): | |
| current_chapter["sections"].append(line.strip()) | |
| return tree | |
| def save_tree(tree, path): | |
| """Save the hierarchical tree.""" | |
| with open(path, "w") as f: | |
| json.dump(tree, f, indent=4) | |
| # 3. Hybrid Retrieval | |
| def hybrid_retrieval(query, openai_api_key): | |
| """Retrieve relevant text using hybrid methods.""" | |
| with open("hierarchical_trees/example_tree.json") as f: # Adjust file path as needed | |
| tree = json.load(f) | |
| all_sections = [ | |
| section for chapter in tree["chapters"] for section in chapter["sections"] | |
| ] | |
| query_embedding = model.encode(query, convert_to_tensor=True) | |
| section_embeddings = model.encode(all_sections, convert_to_tensor=True) | |
| similarities = util.pytorch_cos_sim(query_embedding, section_embeddings) | |
| top_indices = similarities[0].topk(3).indices.tolist() | |
| return " ".join([all_sections[i] for i in top_indices]) | |
| # 4. RAG Answer Generation | |
| def rag_answer(query, context, openai_api_key): | |
| """Generate an answer using Retrieval-Augmented Generation.""" | |
| openai.api_key = openai_api_key | |
| response = openai.Completion.create( | |
| engine="text-davinci-003", | |
| prompt=f"Answer the question based on the context below:\n\nContext: {context}\n\nQuestion: {query}\n\nAnswer:", | |
| max_tokens=150, | |
| ) | |
| return response.choices[0].text.strip() | |