anshumanpatil committed on
Commit 0bde17f · 1 Parent(s): afeb732

Python version changed and code updated

Files changed (3)
  1. app.py +59 -54
  2. old_app.py +84 -0
  3. requirements.txt +17 -56
app.py CHANGED
@@ -1,84 +1,89 @@
- import gradio as gr
- import random
- import time
  from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  from langchain_community.vectorstores import FAISS
  from langchain_community.embeddings import HuggingFaceEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.schema import Document
  from sentence_transformers import SentenceTransformer
- from langchain_community.document_loaders import DirectoryLoader, TextLoader
-
- docs = []
- db = None
-
- def extract_text(folder_path):
-     loader = DirectoryLoader(
-         path=folder_path,
-         glob="*.txt",
-         loader_cls=TextLoader,
-         recursive=True
-     )
-     documents = loader.load()
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-     chunks = text_splitter.split_documents(documents)
-     db = build_faiss(docs)
-     return db
-
- extract_text("msci")
-
  def load_model():
      model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForCausalLM.from_pretrained(model_name)
      return pipeline("text-generation", model=model, tokenizer=tokenizer)

- def build_faiss(_docs):
-     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-     return FAISS.from_documents(_docs, embeddings)
-
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # Hello Everyone!
-         Ask questions about document which link is given below .
-         """)
-     gr.HTML(f"""<a href='https://www.msci.com/indexes#featured-indexes'> MSCI Indexes .</a>""")
-
-     chatbot = gr.Chatbot(type="messages", height=220, label="MSCI Chatbot")
-     msg = gr.Textbox()
-     clear = gr.Button("Clear", variant="secondary")
-
-     def user(user_message, history: list):
-         return "", history + [{"role": "user", "content": user_message}]
-
-     def bot(history: list):
-         bot_message = getMessage() + "..."
-         history.append({"role": "assistant", "content": ""})
-         for character in bot_message:
-             history[-1]['content'] += character
-             time.sleep(0.05)
-             yield history
-
-     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-         bot, chatbot, chatbot
-     )
-     clear.click(lambda: None, None, chatbot, queue=False)
-
-     def getMessage():
-         query = msg.value
-         retriever = db.as_retriever(search_kwargs={"k": 3})
-         retrieved_docs = retriever.get_relevant_documents(query)
-         context = "\n".join([doc.page_content for doc in retrieved_docs])
          result = generator(
              f"Context:\n{context}\n\nQuestion: {query}\nAnswer:",
              max_new_tokens=150,
              temperature=0.5,
              top_p=0.9
          )
          generated = result[0]["generated_text"]
          answer_only = generated.split("Answer:")[-1].strip()
-         return answer_only
-
- demo.launch(share=True)
+ import streamlit as st
+ import pandas as pd
+ import pypdf
+ import docx2txt
  from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  from langchain_community.vectorstores import FAISS
  from langchain_community.embeddings import HuggingFaceEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain.schema import Document
  from sentence_transformers import SentenceTransformer
+
+ # ------------------------------
+ # Title
+ # ------------------------------
+ st.title("📚 RAG Chatbot with TinyLlama")
+
+ # ------------------------------
+ # Load TinyLlama
+ # ------------------------------
+ @st.cache_resource
  def load_model():
      model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
      tokenizer = AutoTokenizer.from_pretrained(model_name)
      model = AutoModelForCausalLM.from_pretrained(model_name)
      return pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+ with st.spinner("🔄 Loading TinyLlama..."):
+     generator = load_model()
+
+ # ------------------------------
+ # File Upload
+ # ------------------------------
+ uploaded_file = st.file_uploader("📂 Upload a file (PDF, DOCX, CSV)", type=["pdf", "docx", "csv"])
+
+ # ------------------------------
+ # Extract Text
+ # ------------------------------
+ def extract_text(file):
+     if file.type == "application/pdf":
+         pdf_reader = pypdf.PdfReader(file)
+         return "\n".join([page.extract_text() for page in pdf_reader.pages if page.extract_text()])
+     elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+         return docx2txt.process(file)
+     elif file.type == "text/csv":
+         df = pd.read_csv(file)
+         return df.to_string(index=False)
+     return ""
+
+ # ------------------------------
+ # Build FAISS Index
+ # ------------------------------
+ @st.cache_resource
+ def build_faiss(_docs):
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     return FAISS.from_documents(_docs, embeddings)
+
+ docs = []
+ db = None
+ if uploaded_file:
+     text = extract_text(uploaded_file)
+     if text:
+         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+         docs = [Document(page_content=chunk) for chunk in splitter.split_text(text)]
+         db = build_faiss(docs)
+         st.success("✅ Knowledge Base ready!")
+
+ # ------------------------------
+ # Chat
+ # ------------------------------
+ query = st.text_input("💬 Ask a question about the uploaded document:")
+
+ if query and db:
+     retriever = db.as_retriever(search_kwargs={"k": 3})
+     retrieved_docs = retriever.get_relevant_documents(query)
+     context = "\n".join([doc.page_content for doc in retrieved_docs])
+
+     with st.spinner("🤔 Generating answer..."):
          result = generator(
              f"Context:\n{context}\n\nQuestion: {query}\nAnswer:",
              max_new_tokens=150,
              temperature=0.5,
              top_p=0.9
          )
+
+         # Extract only what comes after "Answer:"
          generated = result[0]["generated_text"]
          answer_only = generated.split("Answer:")[-1].strip()
+
+     st.write("📝 Answer:", answer_only)
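
For a quick sanity check outside Streamlit, the retrieval-and-generation path that the new app.py wires together can be exercised as a short standalone script. The sketch below is illustrative and not part of this commit: it reuses the model names, splitter settings, top-k value, and prompt format shown in the diff, while the sample text and question are placeholders.

# Standalone sketch of the RAG flow from the new app.py (illustrative only).
# Model names, chunking parameters, and the prompt format mirror the diff;
# SAMPLE_TEXT and QUESTION are placeholders, not repository content.
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
generator = pipeline(
    "text-generation",
    model=AutoModelForCausalLM.from_pretrained(MODEL_NAME),
    tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
)

SAMPLE_TEXT = "MSCI publishes equity indexes such as the MSCI World Index."
QUESTION = "What does MSCI publish?"

# Chunk the text and index it with the same embedding model as app.py.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = [Document(page_content=chunk) for chunk in splitter.split_text(SAMPLE_TEXT)]
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(docs, embeddings)

# Retrieve the top-3 chunks and prompt TinyLlama with them as context.
retrieved = db.as_retriever(search_kwargs={"k": 3}).get_relevant_documents(QUESTION)
context = "\n".join(doc.page_content for doc in retrieved)
result = generator(
    f"Context:\n{context}\n\nQuestion: {QUESTION}\nAnswer:",
    max_new_tokens=150,
    temperature=0.5,
    top_p=0.9,
)
print(result[0]["generated_text"].split("Answer:")[-1].strip())
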
old_app.py ADDED
@@ -0,0 +1,84 @@
+ import gradio as gr
+ import random
+ import time
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+ from langchain_community.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.schema import Document
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
+
+ docs = []
+ db = None
+
+ def extract_text(folder_path):
+     loader = DirectoryLoader(
+         path=folder_path,
+         glob="*.txt",
+         loader_cls=TextLoader,
+         recursive=True
+     )
+     documents = loader.load()
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+     chunks = text_splitter.split_documents(documents)
+     db = build_faiss(docs)
+     return db
+
+ extract_text("msci")
+
+ def load_model():
+     model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForCausalLM.from_pretrained(model_name)
+     return pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+ def build_faiss(_docs):
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+     return FAISS.from_documents(_docs, embeddings)
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Hello Everyone!
+         Ask questions about document which link is given below .
+         """)
+     gr.HTML(f"""<a href='https://www.msci.com/indexes#featured-indexes'> MSCI Indexes .</a>""")
+
+     chatbot = gr.Chatbot(type="messages", height=220, label="MSCI Chatbot")
+     msg = gr.Textbox()
+     clear = gr.Button("Clear", variant="secondary")
+
+     def user(user_message, history: list):
+         return "", history + [{"role": "user", "content": user_message}]
+
+     def bot(history: list):
+         bot_message = getMessage() + "..."
+         history.append({"role": "assistant", "content": ""})
+         for character in bot_message:
+             history[-1]['content'] += character
+             time.sleep(0.05)
+             yield history
+
+     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         bot, chatbot, chatbot
+     )
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+     def getMessage():
+         query = msg.value
+         retriever = db.as_retriever(search_kwargs={"k": 3})
+         retrieved_docs = retriever.get_relevant_documents(query)
+         context = "\n".join([doc.page_content for doc in retrieved_docs])
+         result = generator(
+             f"Context:\n{context}\n\nQuestion: {query}\nAnswer:",
+             max_new_tokens=150,
+             temperature=0.5,
+             top_p=0.9
+         )
+         generated = result[0]["generated_text"]
+         answer_only = generated.split("Answer:")[-1].strip()
+         return answer_only
+
+ demo.launch(share=True)
requirements.txt CHANGED
@@ -1,56 +1,17 @@
- aiofiles==24.1.0
- annotated-types==0.7.0
- anyio==4.10.0
- audioop-lts==0.2.2
- Brotli==1.1.0
- certifi==2025.8.3
- charset-normalizer==3.4.3
- click==8.2.1
- colorama==0.4.6
- fastapi==0.116.1
- ffmpy==0.6.1
- filelock==3.19.1
- fsspec==2025.7.0
- gradio==5.43.1
- gradio_client==1.12.1
- groovy==0.1.2
- h11==0.16.0
- httpcore==1.0.9
- httpx==0.28.1
- huggingface-hub==0.34.4
- idna==3.10
- Jinja2==3.1.6
- markdown-it-py==4.0.0
- MarkupSafe==3.0.2
- mdurl==0.1.2
- numpy==2.3.2
- orjson==3.11.2
- packaging==25.0
- pandas==2.3.2
- pillow==11.3.0
- pydantic==2.11.7
- pydantic_core==2.33.2
- pydub==0.25.1
- Pygments==2.19.2
- python-dateutil==2.9.0.post0
- python-multipart==0.0.20
- pytz==2025.2
- PyYAML==6.0.2
- requests==2.32.5
- rich==14.1.0
- ruff==0.12.9
- safehttpx==0.1.6
- semantic-version==2.10.0
- shellingham==1.5.4
- six==1.17.0
- sniffio==1.3.1
- starlette==0.47.2
- tomlkit==0.13.3
- tqdm==4.67.1
- typer==0.16.1
- typing-inspection==0.4.1
- typing_extensions==4.14.1
- tzdata==2025.2
- urllib3==2.5.0
- uvicorn==0.35.0
- websockets==15.0.1
+ streamlit==1.48.1
+ pandas>=2.2.2
+ torch>=2.4.1
+ transformers==4.43.3
+ langchain>=0.3.3
+ langchain-community>=0.3.3
+ faiss-cpu>=1.8.0
+ pypdf>=3.12.0
+ docx2txt>=0.8
+ sentencepiece>=0.2.0
+ huggingface-hub>=0.23.0
+ scikit-learn>=1.5.0
+ numpy>=1.26.4
+ requests>=2.32.3
+ sentence-transformers>=2.3.0
+ langchain-huggingface>=0.0.3
+ accelerate>=0.34.2
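
Assuming the default Hugging Face Space setup, these pins are what gets installed before the Streamlit entry point starts. For a local check, running pip install -r requirements.txt followed by streamlit run app.py should reproduce the environment and launch the new interface.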