flyfir248 committed on
Commit
2f7d8ba
·
1 Parent(s): aa928dd

Commit: Updated rag.html and routes.py for the /rag route, and requirements.txt, which now includes PyPDF2, ctransformers, and faiss-cpu

Browse files
Files changed (3) hide show
  1. App/routes.py +112 -29
  2. Templates/rag.html +131 -79
  3. requirements.txt +5 -1
App/routes.py CHANGED
@@ -14,6 +14,7 @@ from typing import List, Dict, Optional
14
  import time
15
  from datetime import datetime
16
  import hashlib
 
17
 
18
  # Import existing components
19
  from App.engine.ranker import CPIRanker
@@ -427,46 +428,128 @@ def scholar_discovery():
427
 
428
 
429
  # ====== UPDATED RAG ROUTE (BROWSER-FREE) ======
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
- @main_bp.route('/rag')
 
 
 
432
  def rag_search():
433
- """Enhanced RAG search using browser-free data aggregation"""
434
- query = request.args.get('q', '').strip()
435
  answer = ""
436
  sources = []
437
 
438
- if query:
439
- # Check for HF token
440
- hf_token = os.getenv('HF_TOKEN')
441
-
442
- if not hf_token:
443
- return render_template('rag.html', query=query,
444
- answer="HF_TOKEN not configured. Please set your Hugging Face API token in environment variables.",
445
- sources=[])
 
 
 
 
 
 
 
 
 
446
 
447
- try:
448
- # Initialize systems
449
- scraper = EnhancedScholarScraper()
450
- rag_system = EnhancedRAGSystem(hf_token)
451
 
452
- # Fetch and index data from multiple sources
453
- corpus = scraper.get_comprehensive_corpus(query)
454
- doc_count = rag_system.index_corpus(corpus)
 
 
 
 
 
 
 
 
455
 
456
- if doc_count == 0:
457
- answer = f"No academic sources found for '{query}'. Try a broader search term or different keywords."
458
- else:
459
- # Generate synthesis
460
- result = rag_system.generate_synthesis(query, k=5)
461
- answer = result['answer']
462
- sources = result['sources']
463
 
464
- except Exception as e:
465
- print(f"RAG Error: {e}")
466
- answer = f"An error occurred while processing your request: {str(e)}"
 
467
 
468
- return render_template('rag.html', query=query, answer=answer, sources=sources)
 
 
 
 
469
 
 
470
 
471
  # ====== EXISTING DISCOVERY ROUTE (PRESERVED) ======
472
 
 
14
  import time
15
  from datetime import datetime
16
  import hashlib
17
+ import PyPDF2
18
 
19
  # Import existing components
20
  from App.engine.ranker import CPIRanker
 
428
 
429
 
430
  # ====== UPDATED RAG ROUTE (BROWSER-FREE) ======
431
+ import os
432
+ from flask import request, render_template
433
+ import PyPDF2
434
+
435
+ from langchain_community.llms import CTransformers
436
+ from langchain_community.embeddings import HuggingFaceEmbeddings
437
+ from langchain_community.vectorstores import FAISS
438
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
439
+
440
+ # Core LCEL imports
441
+ from langchain_core.runnables import RunnablePassthrough
442
+ from langchain_core.output_parsers import StrOutputParser
443
+ from langchain_core.prompts import ChatPromptTemplate
444
+
445
+ # ----------------------------
446
+ # INITIALIZE EMBEDDINGS & LLM
447
+ # ----------------------------
448
+ embeddings = HuggingFaceEmbeddings(
449
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
450
+ )
451
+
452
+ # Use a local GGUF model that supports text-generation
453
+ llm = CTransformers(
454
+ model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
455
+ model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
456
+ model_type="llama",
457
+ config={
458
+ 'max_new_tokens': 512,
459
+ 'temperature': 0.7,
460
+ 'context_length': 2048,
461
+ 'gpu_layers': 0
462
+ }
463
+ )
464
+
465
+
466
+ # ----------------------------
467
+ # 1. HELPERS
468
+ # ----------------------------
469
def format_docs(docs):
    """Return the ``page_content`` of every document in *docs*, joined by blank lines."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)
472
+
473
+
474
def extract_answer(text):
    """Return only the assistant's reply from raw TinyLlama output.

    TinyLlama often echoes the full prompt. When the ``<|assistant|>`` tag is
    present, everything up to and including its last occurrence is dropped.
    The ``</s>`` end-of-sequence token is removed in all cases.

    Args:
        text: Raw string produced by the model.

    Returns:
        The cleaned answer with no chat tags and no surrounding whitespace.
    """
    marker = "<|assistant|>"
    if marker in text:
        text = text.split(marker)[-1]
    # Remove the EOS token *before* stripping; otherwise "answer </s>" would
    # keep the whitespace that preceded the token.
    return text.replace("</s>", "").strip()
483
+
484
+
485
+ # ----------------------------
486
+ # 2. PROMPT: Optimized for TinyLlama Chat
487
+ # ----------------------------
488
+ # Note: Spacing is critical for TinyLlama to recognize the end of the user block
489
# `llm` is a plain text-completion model, so the prompt must reach it verbatim.
# A ChatPromptTemplate is stringified as a chat transcript when piped into a
# non-chat LLM, which puts "Human: " in front of the "<|system|>" tag and breaks
# the TinyLlama chat format. A string PromptTemplate passes the text unchanged.
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(
    "<|system|>\nYou are a concise assistant. Use the provided context to answer the question. If the answer isn't in the context, say you don't know.</s>\n<|user|>\nContext: {context}\nQuestion: {input}</s>\n<|assistant|>\n")
491
+
492
 
493
+ # ----------------------------
494
+ # 3. ROUTE
495
+ # ----------------------------
496
@main_bp.route('/rag', methods=['GET', 'POST'])
def rag_search():
    """Answer a question about an uploaded PDF with a local RAG pipeline.

    GET renders the empty form. POST reads the ``q`` form field and the
    ``pdf_file`` upload, splits the PDF text into chunks, indexes them in an
    in-memory FAISS store, retrieves the 3 closest chunks for the question and
    asks the module-level ``llm`` to answer from that context.

    Returns:
        The rendered ``rag.html`` template with ``query``, ``answer`` and
        ``sources`` (the retrieved chunk texts, empty when nothing was
        retrieved).
    """
    query = ""
    answer = ""
    sources = []

    if request.method == 'POST':
        query = request.form.get('q', '').strip()
        pdf_file = request.files.get('pdf_file')

        # Case-insensitive check so uploads named e.g. "REPORT.PDF" are accepted.
        if pdf_file and (pdf_file.filename or "").lower().endswith('.pdf'):
            try:
                # 1) Read & split the PDF. Pages are joined with a newline so the
                #    last word of one page is not fused with the first of the next.
                reader = PyPDF2.PdfReader(pdf_file)
                full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=500,
                    chunk_overlap=50,
                    separators=["\n\n", "\n", ".", " "],
                )
                chunks = splitter.split_text(full_text)

                if not chunks:
                    # Image-only / scanned PDFs yield no text; building a vector
                    # store from an empty list would fail with an opaque error.
                    answer = "No extractable text was found in this PDF."
                elif not query:
                    answer = "PDF uploaded successfully. What is your question?"
                else:
                    # 2) Build the per-request vector store and retrieve ONCE;
                    #    the same documents feed both the sidebar and the prompt.
                    vector_db = FAISS.from_texts(chunks, embeddings)
                    retriever = vector_db.as_retriever(search_kwargs={"k": 3})
                    docs = retriever.invoke(query)
                    sources = [doc.page_content for doc in docs]

                    # 3) prompt -> llm -> string -> strip TinyLlama chat tags.
                    answer_chain = prompt | llm | StrOutputParser() | extract_answer
                    answer = answer_chain.invoke(
                        {"context": format_docs(docs), "input": query}
                    )

            except Exception as e:
                # Route-level boundary: report the failure instead of a 500 page.
                # NOTE(review): rag.html renders `answer` with `| safe`, so this
                # text (and the model output) reaches the page unescaped.
                print("RAG Error:", e)
                answer = f"System Error: {str(e)}"
        else:
            answer = "Please upload a valid PDF file."

    return render_template("rag.html", query=query, answer=answer, sources=sources)
553
 
554
  # ====== EXISTING DISCOVERY ROUTE (PRESERVED) ======
555
 
Templates/rag.html CHANGED
@@ -1,103 +1,155 @@
1
  {% extends "base.html" %}
2
 
3
  {% block content %}
4
- <div class="min-h-screen bg-slate-50 p-8">
5
- <div class="max-w-6xl mx-auto">
6
- <div class="mb-12">
7
- <h1 class="text-4xl font-black text-slate-900 mb-2 uppercase italic flex items-center">
8
- <i class="fas fa-microchip text-indigo-600 mr-4"></i>RAG Intelligence
9
- </h1>
10
- <p class="text-slate-500 font-medium mb-8">Retrieval-Augmented Generation for Scientific Synthesis</p>
11
-
12
- <form action="/rag" method="GET" class="relative group">
13
- <input type="text" name="q" value="{{ query }}"
14
- placeholder="Enter research topic for synthesis (e.g., 'T cell exhaustion markers')..."
15
- class="w-full bg-white border-2 border-slate-200 rounded-2xl px-8 py-6 text-lg shadow-sm focus:border-indigo-500 focus:ring-4 focus:ring-indigo-50/50 transition-all outline-none pr-48">
16
- <button type="submit" class="absolute right-4 top-4 bottom-4 bg-indigo-600 text-white px-10 rounded-xl font-bold hover:bg-indigo-700 hover:shadow-lg transition-all active:scale-95">
17
- Synthesize
18
- </button>
19
- </form>
20
-
21
- <div class="mt-4 flex items-center gap-6 text-[10px] font-bold text-slate-400 uppercase tracking-widest">
22
- <div class="flex items-center gap-2">
23
- <span class="w-2 h-2 rounded-full bg-green-500 animate-pulse"></span>
24
- Hugging Face Inference Active
25
- </div>
26
- <div class="flex items-center gap-2">
27
- <i class="fas fa-database text-indigo-400"></i>
28
- In-Memory Vector Store
29
  </div>
 
 
 
 
 
 
 
30
  </div>
31
  </div>
32
 
33
- {% if answer %}
34
- <div class="grid grid-cols-1 lg:grid-cols-12 gap-10 items-start">
35
- <div class="lg:col-span-8 space-y-6">
36
- <div class="bg-white p-10 rounded-[2.5rem] border border-slate-200 shadow-sm relative overflow-hidden">
37
- <div class="absolute top-0 right-0 w-32 h-32 bg-indigo-50/50 rounded-bl-full -mr-16 -mt-16"></div>
 
 
38
 
39
- <div class="flex items-center justify-between mb-8 relative">
40
- <h3 class="text-xs font-black text-indigo-600 uppercase tracking-[0.2em]">Research Synthesis</h3>
41
- <span class="text-[10px] font-bold text-slate-400 bg-slate-100 px-3 py-1 rounded-full">Llama-3-8B</span>
42
- </div>
43
 
44
- <div class="prose prose-indigo max-w-none text-slate-700 text-lg leading-relaxed">
45
- {{ answer | replace('\n', '<br>') | safe }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  </div>
47
 
48
- <div class="mt-10 pt-8 border-t border-slate-100 flex items-center justify-between">
49
- <p class="text-[11px] text-slate-400 font-medium">Verified against top {{ sources|length }} peer-reviewed profiles</p>
50
- <button class="text-indigo-600 font-bold text-xs hover:underline">Export Analysis <i class="fas fa-download ml-1"></i></button>
51
  </div>
52
  </div>
53
  </div>
54
 
55
- <div class="lg:col-span-4 space-y-4">
56
- <div class="flex items-center justify-between mb-2">
57
- <h3 class="text-xs font-black text-slate-400 uppercase tracking-widest">Primary Evidence</h3>
58
- <i class="fas fa-filter text-slate-300 text-xs"></i>
 
 
 
 
59
  </div>
60
 
61
- {% for source in sources %}
62
- <div class="bg-white p-6 rounded-2xl border border-slate-200 shadow-sm group hover:border-indigo-500 hover:shadow-md transition-all relative overflow-hidden">
63
- <div class="absolute top-0 right-0 bg-slate-900 text-white px-3 py-1 text-[9px] font-black italic rounded-bl-xl group-hover:bg-indigo-600 transition-colors">
64
- #{{ loop.index }}
65
- </div>
66
-
67
- <h4 class="font-bold text-slate-900 mb-1 group-hover:text-indigo-600 transition-colors pr-8 leading-tight">
68
- {{ source.name }}
69
- </h4>
70
- <p class="text-[11px] text-slate-500 italic mb-4 line-clamp-1">{{ source.affiliation }}</p>
71
-
72
- <div class="flex items-center justify-between mt-auto">
73
- <a href="{{ source.profile_url }}" target="_blank" class="text-[10px] bg-slate-50 text-slate-600 px-3 py-2 rounded-lg font-bold hover:bg-indigo-50 hover:text-indigo-600 transition-colors">
74
- Full Profile <i class="fas fa-external-link-alt ml-1 opacity-50"></i>
75
- </a>
76
- <div class="text-[10px] font-black text-indigo-400 uppercase">Specialist</div>
77
- </div>
78
- </div>
79
- {% endfor %}
80
-
81
- <div class="bg-indigo-600 p-6 rounded-2xl text-white shadow-lg shadow-indigo-200">
82
- <p class="text-xs font-bold opacity-80 mb-2 uppercase tracking-widest">Methodology</p>
83
- <h5 class="text-sm font-black italic mb-4">Composite Performance Index</h5>
84
- <p class="text-[10px] leading-relaxed opacity-90">
85
- This synthesis is generated by embedding researcher profiles into a vector space and retrieving context based on topical density.
86
- </p>
87
- </div>
88
- </div>
89
- </div>
90
- {% elif query %}
91
- <div class="bg-white border-2 border-dashed border-slate-200 rounded-[3rem] py-32 text-center">
92
- <div class="max-w-xs mx-auto">
93
- <div class="w-16 h-16 bg-slate-100 rounded-full flex items-center justify-center mx-auto mb-6 text-slate-300">
94
- <i class="fas fa-search text-2xl"></i>
95
  </div>
96
- <h3 class="text-xl font-bold text-slate-800 mb-2">Insufficient context found</h3>
97
- <p class="text-slate-400 text-sm">Our crawler could not verify enough specialist profiles for "{{ query }}". Try a broader scientific term.</p>
98
  </div>
99
  </div>
100
  {% endif %}
101
  </div>
102
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  {% endblock %}
 
1
  {% extends "base.html" %}
2
 
3
  {% block content %}
4
+ <div class="min-h-screen bg-[#f8fafc] p-8 font-sans">
5
+ <div class="max-w-7xl mx-auto">
6
+ <div class="mb-10 flex items-center justify-between">
7
+ <div class="flex items-center gap-3">
8
+ <div class="bg-blue-600 p-2 rounded-lg">
9
+ <i class="fas fa-microscope text-white text-xl"></i>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  </div>
11
+ <h1 class="text-2xl font-bold text-[#1e293b] tracking-tight">Qsearch Intelligence</h1>
12
+ </div>
13
+ <div class="flex gap-6 text-sm font-medium text-slate-500">
14
+ <span class="hover:text-blue-600 cursor-pointer flex items-center gap-2"><i class="fas fa-database"></i> Artifacts</span>
15
+ <span class="hover:text-blue-600 cursor-pointer flex items-center gap-2"><i class="fas fa-file-alt"></i> Publications</span>
16
+ <span class="hover:text-blue-600 cursor-pointer flex items-center gap-2"><i class="fas fa-graduation-cap"></i> Scholar</span>
17
+ <button class="bg-slate-100 px-4 py-2 rounded-md text-slate-700 hover:bg-slate-200 transition-all">Analysis Tools <i class="fas fa-chevron-down ml-1 text-xs"></i></button>
18
  </div>
19
  </div>
20
 
21
+ <div class="bg-white rounded-3xl shadow-sm border border-slate-200 p-2 mb-10">
22
+ <form action="/rag" method="POST" enctype="multipart/form-data" onsubmit="return showLoading()" class="flex items-center gap-2">
23
+ <label class="flex items-center justify-center px-6 py-4 border-r border-slate-100 cursor-pointer hover:bg-slate-50 transition-colors">
24
+ <i class="fas fa-paperclip text-slate-400 mr-2"></i>
25
+ <span id="file-name" class="text-sm font-semibold text-slate-500 truncate max-w-[120px]">Upload PDF</span>
26
+ <input type="file" name="pdf_file" id="pdf_file" accept=".pdf" class="hidden" onchange="updateFileName(this)" required />
27
+ </label>
28
 
29
+ <input type="text" name="q" id="query-input" placeholder="Ask anything about your document..."
30
+ class="flex-1 px-6 py-4 text-slate-700 outline-none placeholder:text-slate-400" required>
 
 
31
 
32
+ <button type="submit" id="submit-btn" class="bg-blue-600 text-white px-8 py-4 rounded-2xl font-bold hover:bg-blue-700 transition-all flex items-center gap-3 mr-2">
33
+ <span id="btn-text">Analyze</span>
34
+ <div id="loading-spinner" class="hidden animate-spin h-4 w-4 border-2 border-white border-t-transparent rounded-full"></div>
35
+ </button>
36
+ </form>
37
+ </div>
38
+
39
+ {% if answer %}
40
+ <div id="results-area" class="grid grid-cols-1 lg:grid-cols-12 gap-8 animate-fade-in">
41
+ <div class="lg:col-span-8">
42
+ <div class="bg-white p-10 rounded-[2rem] border border-slate-200 shadow-sm min-h-[400px]">
43
+ <div class="flex items-center justify-between mb-8">
44
+ <h3 class="text-[11px] font-black text-indigo-500 uppercase tracking-[0.2em]">Generated Answer</h3>
45
+ <button onclick="copyAnswer()" class="flex items-center gap-2 text-xs font-bold text-slate-400 hover:text-slate-600 bg-slate-50 px-3 py-1.5 rounded-md transition-all">
46
+ <i class="fas fa-copy"></i> Copy
47
+ </button>
48
  </div>
49
 
50
+ <div id="answer-content" class="text-[#334155] text-lg leading-relaxed prose prose-slate max-w-none">
51
+ {{ answer | safe }}
 
52
  </div>
53
  </div>
54
  </div>
55
 
56
+ <div class="lg:col-span-4 space-y-6">
57
+ <div class="flex items-center justify-between px-2">
58
+ <h3 class="text-[11px] font-black text-slate-400 uppercase tracking-[0.2em]">Supporting Context</h3>
59
+ {% if sources %}
60
+ <span class="text-[10px] bg-indigo-50 text-indigo-600 px-2.5 py-1 rounded-full font-bold">
61
+ {{ sources|length }} snippets
62
+ </span>
63
+ {% endif %}
64
  </div>
65
 
66
+ <div class="space-y-4">
67
+ {% if sources %}
68
+ {% for source in sources %}
69
+ <div class="bg-white p-6 rounded-2xl border border-slate-100 shadow-sm hover:shadow-md transition-all group relative">
70
+ <div class="flex items-center justify-between mb-4">
71
+ <span class="text-[10px] font-black text-slate-300 uppercase tracking-widest">Chunk {{ loop.index }}</span>
72
+ <button onclick="copySnippet({{ loop.index - 1 }})" class="text-slate-300 hover:text-indigo-500 transition-colors">
73
+ <i class="fas fa-copy text-xs"></i>
74
+ </button>
75
+ </div>
76
+ <p class="text-[13px] text-slate-500 leading-relaxed snippet-text italic" data-full-text="{{ source }}">
77
+ "{{ source[:220] }}{% if source|length > 220 %}...{% endif %}"
78
+ </p>
79
+ {% if source|length > 220 %}
80
+ <button onclick="toggleSnippet({{ loop.index - 1 }})" class="text-[10px] text-indigo-500 font-bold mt-3 uppercase tracking-wider hover:underline">
81
+ Show More
82
+ </button>
83
+ {% endif %}
84
+ </div>
85
+ {% endfor %}
86
+ {% else %}
87
+ <div class="text-center py-10">
88
+ <i class="fas fa-quote-right text-slate-200 text-3xl mb-3"></i>
89
+ <p class="text-sm text-slate-400 italic">No snippets found.</p>
90
+ </div>
91
+ {% endif %}
 
 
 
 
 
 
 
 
92
  </div>
 
 
93
  </div>
94
  </div>
95
  {% endif %}
96
  </div>
97
  </div>
98
+
99
+ <script>
100
// Shows the chosen file's name in the upload label and highlights it.
function updateFileName(input) {
    const label = document.getElementById('file-name');
    if (!(input.files && input.files[0])) {
        return; // nothing selected: leave the label untouched
    }
    const chosen = input.files[0];
    label.textContent = chosen.name;
    label.classList.remove('text-slate-500');
    label.classList.add('text-blue-600');
}
108
+
109
// Form onsubmit handler: puts the submit button into its "busy" state.
// Always returns true so the form submission proceeds.
function showLoading() {
    const submitButton = document.getElementById('submit-btn');
    const label = document.getElementById('btn-text');
    const busyIcon = document.getElementById('loading-spinner');

    submitButton.disabled = true;
    submitButton.classList.add('opacity-80');
    label.textContent = 'Analyzing...';
    busyIcon.classList.remove('hidden');

    return true;
}
120
+
121
// Copies the rendered answer text to the clipboard. The confirmation is shown
// only after the asynchronous clipboard write has actually succeeded.
function copyAnswer() {
    const text = document.getElementById('answer-content').innerText;
    navigator.clipboard.writeText(text)
        .then(() => alert('Answer copied!'))
        .catch(() => alert('Could not copy the answer.'));
}

// Copies the full text of the snippet at `index` (0-based position among the
// `.snippet-text` elements). The per-chunk copy buttons in the template call
// this handler, which previously had no definition.
function copySnippet(index) {
    const snippet = document.querySelectorAll('.snippet-text')[index];
    if (!snippet) {
        return;
    }
    const fullText = snippet.getAttribute('data-full-text') || '';
    navigator.clipboard.writeText(fullText)
        .catch(() => alert('Could not copy the snippet.'));
}
126
+
127
// Expands or collapses the snippet at `index` (0-based position among the
// `.snippet-text` elements) and updates the label of its toggle button.
function toggleSnippet(index) {
    const snippet = document.querySelectorAll('.snippet-text')[index];
    if (!snippet) {
        return;
    }
    // The toggle button is rendered directly after its snippet, and only for
    // snippets longer than 220 characters. Looking it up relative to the
    // snippet avoids indexing a list of buttons that skips short snippets.
    const button = snippet.nextElementSibling;
    const fullText = snippet.getAttribute('data-full-text');
    const expanding = !snippet.classList.contains('expanded');

    snippet.textContent = expanding
        ? `"${fullText}"`
        : `"${fullText.substring(0, 220)}..."`;
    snippet.classList.toggle('expanded', expanding);
    if (button) {
        button.textContent = expanding ? 'Show Less' : 'Show More';
    }
}
144
+ </script>
145
+
146
+ <style>
147
+ @keyframes fade-in {
148
+ from { opacity: 0; transform: translateY(20px); }
149
+ to { opacity: 1; transform: translateY(0); }
150
+ }
151
+ .animate-fade-in { animation: fade-in 0.6s cubic-bezier(0.22, 1, 0.36, 1) forwards; }
152
+ body { background-color: #f8fafc; }
153
+ .prose p { margin-bottom: 1.5rem; }
154
+ </style>
155
  {% endblock %}
requirements.txt CHANGED
@@ -22,7 +22,11 @@ Bio
22
  langchain-huggingface
23
  langchain-core
24
  huggingface-hub
 
25
 
26
  scholarly
27
  feedparser
28
- python-dateutil
 
 
 
 
22
  langchain-huggingface
23
  langchain-core
24
  huggingface-hub
25
+ PyPDF2
26
 
27
  scholarly
28
  feedparser
29
+ python-dateutil
30
+
31
+ ctransformers
32
+ faiss-cpu