# Hugging Face Space: PDF summarizer (Mistral-7B-Instruct + llama_index + Gradio).
"""
import os
import requests
import torch
import gradio as gr
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings import LangchainEmbedding
def download_pdf_from_url(url, save_path="/content/Data/input.pdf"):
response = requests.get(url)
if response.status_code == 200:
with open(save_path, 'wb') as file:
file.write(response.content)
print(f"PDF downloaded and saved to {save_path}")
else:
print(f"Failed to download PDF. Status code: {response.status_code}")
def mod(pdf_url):
if not os.path.exists("/Data/"): # /content/Data --> /Data/
os.makedirs("/Data/") # /content/Data --> /Data/
download_pdf_from_url(pdf_url) # /content/Data --> /Data/
documents = SimpleDirectoryReader("/Data/").load_data()
system_prompt = '''You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your goal is to summarize pdf which may also include tabular columns, as
accurately as possible based on the instructions and context provided.'''
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")
from huggingface_hub import login
hf_token = os.environ.get('HF_TOKEN')
if not hf_token:
raise ValueError("HF_TOKEN environment variable not found. Please set it in your Space settings.")
login(token=hf_token)
llm = HuggingFaceLLM(
context_window=4096,
max_new_tokens=750,
generate_kwargs={"temperature": 0.5, "do_sample": False},
system_prompt=system_prompt,
query_wrapper_prompt=query_wrapper_prompt,
tokenizer_name="mistralai/Mistral-7B-Instruct-v0.1",
model_name="mistralai/Mistral-7B-Instruct-v0.1",
device_map="auto",
model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
)
embed_model = LangchainEmbedding(
HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)
service_context = ServiceContext.from_defaults(
chunk_size=1024,
llm=llm,
embed_model=embed_model
)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query('''You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your task is to analyze the given document and generate a structured summary in approximately 500 words. Ensure the summary:
Captures all key points, including data, insights, and observations.
Clearly outlines the context, such as the purpose of the document and relevant background information.
Summarizes tabular data and numerical figures effectively, while retaining accuracy and relevance.
Highlights significant trends, comparisons, or impacts mentioned in the document.
Uses formal and precise language suitable for a corporate or academic audience.
The output should be well-organized with clear headings or bullet points where applicable. Avoid omitting any critical information, and focus on maintaining a balance between brevity and detail.''')
return str(response.response)
def func(url):
return mod(url)
iface = gr.Interface(
fn=func,
inputs="text",
outputs=gr.Textbox(
label="Output Summary",
placeholder="The summary will appear here . . .",
lines=10,
interactive=False),
examples=[['https://cdn-sn.samco.in/ec90fa5b637541d3c86fdb86f45d920c.pdf'],
['https://cdn-sn.samco.in/7c8616b72b4aa639c0eda9f44285ab1d.pdf'],
['https://cdn-sn.samco.in/a4b95bc0bdb8361459a8b41bfc0ff317.pdf']],
flagging_options=["Useful", "Mediocre 50-50", "Not Useful"],
description="Flag it for every response and classify it according to what you feel!"
)
iface.launch(share=True, debug=True)
"""
import os
import requests
import torch
import gradio as gr
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings import LangchainEmbedding
import fitz # PyMuPDF
# Function to process the PDF directly from URL
def process_pdf_from_url(pdf_url):
    """Download a PDF from *pdf_url* and return its extracted plain text.

    The PDF is parsed entirely in memory (no temp file on disk). On a
    non-200 HTTP status a diagnostic is printed and an empty string is
    returned instead of raising, so callers can detect failure with a
    simple truthiness check.
    """
    # Timeout guards against hanging indefinitely on an unresponsive host.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        print(f"Failed to retrieve PDF. Status code: {response.status_code}")
        return ""
    # Open the PDF from the in-memory bytes; close it afterwards so the
    # underlying MuPDF resources are released (the original leaked them).
    doc = fitz.open(stream=response.content, filetype="pdf")
    try:
        # join() builds the full text in one pass instead of quadratic +=.
        return "".join(page.get_text("text") for page in doc)
    finally:
        doc.close()
def mod(pdf_url):
    """Summarize the PDF at *pdf_url* with Mistral-7B via llama_index.

    Downloads and extracts the PDF text, builds a vector index over it,
    and queries the index for a ~500-word structured summary.

    Returns:
        The summary text, or the string "Failed to process the PDF."
        when the PDF could not be downloaded/parsed.

    Raises:
        ValueError: if the HF_TOKEN environment variable is not set.
    """
    # Process the PDF directly from URL (no intermediate file on disk).
    document_text = process_pdf_from_url(pdf_url)
    if not document_text:
        return "Failed to process the PDF."
    # BUG FIX: VectorStoreIndex.from_documents expects llama_index Document
    # objects, not bare strings -- a raw str has no doc id / metadata and
    # fails during indexing. Wrap the extracted text accordingly.
    from llama_index import Document
    documents = [Document(text=document_text)]
    system_prompt = """You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your goal is to summarize pdf which may also include tabular columns, as accurately as possible based on the instructions and context provided."""
    query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")
    # Authenticate with Hugging Face -- required for the gated Mistral weights.
    from huggingface_hub import login
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not found. Please set it in your Space settings.")
    login(token=hf_token)
    # Mistral-7B-Instruct loaded in 8-bit so it fits on a single GPU.
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=750,
        generate_kwargs={"temperature": 0.5, "do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="mistralai/Mistral-7B-Instruct-v0.1",
        model_name="mistralai/Mistral-7B-Instruct-v0.1",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
    )
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )
    # Bundle LLM + embeddings + chunking config for the index build.
    service_context = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=llm,
        embed_model=embed_model
    )
    # Embed and index the document, then run the summarization query.
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    query_engine = index.as_query_engine()
    response = query_engine.query("""You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your task is to analyze the given document and generate a structured summary in approximately 500 words. Ensure the summary:
- Captures all key points, including data, insights, and observations.
- Clearly outlines the context, such as the purpose of the document and relevant background information.
- Summarizes tabular data and numerical figures effectively, while retaining accuracy and relevance.
- Highlights significant trends, comparisons, or impacts mentioned in the document.
- Uses formal and precise language suitable for a corporate or academic audience.
The output should be well-organized with clear headings or bullet points where applicable. Avoid omitting any critical information, and focus on maintaining a balance between brevity and detail.""")
    return str(response.response)
# Gradio Interface
def func(url):
    """Gradio callback: delegate to mod() and return the summary for *url*."""
    summary = mod(url)
    return summary
# Build the web UI: a single text input (PDF URL) mapped to a read-only
# summary textbox, with sample reports and a feedback flagging widget.
summary_box = gr.Textbox(
    label="Output Summary",
    placeholder="The summary will appear here . . .",
    lines=10,
    interactive=False,
)
sample_reports = [
    ['https://cdn-sn.samco.in/ec90fa5b637541d3c86fdb86f45d920c.pdf'],
    ['https://cdn-sn.samco.in/7c8616b72b4aa639c0eda9f44285ab1d.pdf'],
    ['https://cdn-sn.samco.in/a4b95bc0bdb8361459a8b41bfc0ff317.pdf'],
]
iface = gr.Interface(
    fn=func,
    inputs="text",
    outputs=summary_box,
    examples=sample_reports,
    flagging_options=["Useful", "Mediocre 50-50", "Not Useful"],
    description="Flag it for every response and classify it according to what you feel!",
)
iface.launch(share=True, debug=True)