Spaces:

StefanoDUrso
/

ELI-chatbot

Sleeping

App Files Files Community

StefanoDUrso commited on Apr 17, 2025

Commit

6e5b27a

0 Parent(s):

first commit

Browse files

Files changed (18) hide show

.gitignore +10 -0
__pycache__/config.cpython-312.pyc +0 -0
app.backup +126 -0
app.py +218 -0
config.py +72 -0
data/txt/Key statisitcs startups.txt +67 -0
data/txt/Risk and Return part 1 v1.txt +63 -0
data/txt/Risk and Return part 2 v1.txt +123 -0
data/txt/VL Myths about entrepreneurs.txt +83 -0
readme.md +36 -0
requirements.txt +110 -0
utilities/llm/LlmManager.py +212 -0
utilities/llm/__pycache__/LlmManager.cpython-312.pyc +0 -0
utilities/vectorstore/QdrantLangchainManager.py +370 -0
utilities/vectorstore/SummaryManager.py +440 -0
utilities/vectorstore/__pycache__/QdrantLangchainManager.cpython-312.pyc +0 -0
utilities/vectorstore/__pycache__/SummaryManager.cpython-312.pyc +0 -0
utils.py +102 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+.env
+.*
+data/mp4
+data/mp3
+# Except .gitignore itself
+!.gitignore
+# Except .gitkeep files
+!.gitkeep

__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (2.47 kB). View file

app.backup ADDED Viewed

	@@ -0,0 +1,126 @@

+import sys
+import time
+import gradio as gr
+from config import initialize, check_user
+# Build the shared managers once, at import time.
+# NOTE(review): if initialize() ever returns a bare None instead of a 2-tuple,
+# the unpacking below raises before either None check runs - confirm its contract.
+llm_manager, qdrant_manager = initialize()
+# Abort start-up when the LLM manager could not be created.
+if llm_manager is None:
+    print("Error: Failed to initialize configuration: llm_manager. Exiting application.", flush=True)
+    sys.exit(1)
+# Abort start-up when the Qdrant manager could not be created.
+if qdrant_manager is None:
+    print("Error: Failed to initialize configuration: qdrant_manager. Exiting application.", flush=True)
+    sys.exit(1)
+def reset_textbox():
+    """Clears the textbox after sending a message."""
+    return gr.update(value="")
+def slow_echo(message, history):
+    if history is None:
+        history = []  # Ensure history is initialized
+    # Append user message with role "user"
+    history.append({"role": "user", "content": message})
+    # Placeholder for assistant response
+    bot_entry = {"role": "assistant", "content": ""}
+    history.append(bot_entry)
+    response = "You typed: "
+    for i in range(len(message)):
+        time.sleep(0.05)
+        response += message[i]
+        bot_entry["content"] = response  # Update assistant's response progressively
+        yield history  # Yield updated history in the correct format
+    yield history  # Final yield with full message
+def llm_send_message(message, history):
+    if history is None:
+        history = []
+    # Append user message to history
+    history.append({"role": "user", "content": message})
+    yield history
+    # Placeholder for assistant response
+    bot_entry = {"role": "assistant", "content": ""}
+    history.append(bot_entry)
+    # Send message to LLM and stream response
+    response = ""
+    for chunk in llm_manager.send_message(message):  # Streaming response
+        time.sleep(0.01)  # Simulate gradual output
+        response += chunk
+        bot_entry["content"] = response  # Update assistant response progressively
+        yield history  # Yield updated history
+    yield history  # Final yield
+def authenticate(username, password):
+    if check_user(username, password):
+        print("🔑 Login successful!")
+        return gr.update(visible=False), gr.update(visible=True), gr.update(value="", visible=False)  # Hide login, show chatbot, clear error
+    else:
+        print("❌ Incorrect username or password")
+        return gr.update(visible=True), gr.update(visible=True), gr.update(value="❌ Incorrect username or password", visible=True)  # Show error
+# UI layout: a login column shown first and a chat column that starts hidden.
+with gr.Blocks(fill_height=True) as demo:
+    # Login form; authenticate() toggles this column's visibility.
+    with gr.Column(visible=True) as login_section:
+        gr.Markdown("### 🔒 Login Required")
+        username_input = gr.Textbox(label="Username")
+        password_input = gr.Textbox(label="Password", type="password")
+        login_button = gr.Button("Login")
+        error_message = gr.Text("", visible=False)
+    # Chat panel; starts hidden.
+    with gr.Column(visible=False) as chat_section:
+        chat_configuration = gr.Markdown("")
+        chat = gr.Chatbot(
+            label="Video Helper",
+            type="messages"
+        )
+        # NOTE(review): the name `input` shadows the Python builtin within this module.
+        input = gr.Textbox(
+            label="Input",
+            placeholder="Type something here..."
+        )
+        # Holds the submitted text so the textbox can be cleared before the reply streams.
+        stored_message = gr.State()
+        # Enter key: stash the text and clear the box, then stream a reply.
+        # NOTE(review): this path runs the slow_echo stub while the Send button
+        # below runs llm_send_message - confirm which one is intended.
+        input.submit(
+            fn=lambda text: (text, ""),
+            inputs=[input],
+            outputs=[stored_message, input]
+        ).then(
+            #fn=llm_send_message,
+            fn=slow_echo,
+            inputs=[stored_message, chat],
+            outputs=chat
+        )
+        send_btn = gr.Button("Send")
+        # Send button: same two-step chain, answered by the LLM.
+        send_btn.click(
+            fn=lambda text: (text, ""),
+            inputs=[input],
+            outputs=[stored_message, input]
+        ).then(
+            fn=llm_send_message,
+            inputs=[stored_message, chat],
+            outputs=chat
+        )
+    # authenticate() returns one update per output component, in this order.
+    login_button.click(
+        authenticate,
+        [username_input, password_input],
+        [login_section, chat_section, error_message]
+    )
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

app.py ADDED Viewed

	@@ -0,0 +1,218 @@

+import sys
+import time
+import gradio as gr
+from config import initialize, check_user
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+# Build the shared managers once, at import time; exit if either is missing.
+# NOTE(review): if initialize() ever returns a bare None instead of a 2-tuple,
+# the unpacking below raises before the None check runs - confirm its contract.
+llm_manager, qdrant_manager = initialize()
+if llm_manager is None or qdrant_manager is None:
+    print("❌ Failed to initialize. Exiting.")
+    sys.exit(1)
+# Name of the Qdrant collection looked up at start-up.
+collection_name = "key_statistics"
+# Link the vector store to the LLM only when the collection lookup returns something truthy.
+if qdrant_manager.get_collection(collection_name):
+    llm_manager.set_qdrant_manager(qdrant_manager)
+    print(f"✅ Collection '{collection_name}' loaded and linked to LLM.")
+else:
+    # Start-up continues without the link; only a warning is printed.
+    print(f"⚠️ Collection '{collection_name}' not found or empty.")
+def reset_textbox():
+    return gr.update(value="")
+def show_spinner():
+    return gr.update(visible=True)
+def hide_spinner():
+    return gr.update(visible=False)
+def set_interactive_state(interactive: bool):
+    return (
+        gr.update(interactive=interactive),  # input
+        gr.update(interactive=interactive)   # send_btn
+    )
+def _add_trust_icon(text, level):
+    icons = {
+        "high": "🟢",  # alta affidabilità
+        "medium": "🟡",  # simile ma non perfetto
+        "summary": "🟠",  # riassunto usato
+        "low": "🔴",  # basato solo sulla chat
+        "no_context": "⚪️"
+    }
+    label = {
+        "high": "Reliable",
+        "medium": "Moderate Similarity",
+        "summary": "Summary Used",
+        "low": "No Source Match",
+        "no_context": "No Context"
+    }
+    icon = icons.get(level, "⚪️")
+    tooltip = label.get(level, "Unknown")
+    # Prepend semaforo + etichetta (puoi anche usare HTML per Gradio futuro)
+    return f"{icon} *{tooltip}*\n\n{text}"
+def get_summary(summary_type, history):
+    if history is None:
+        history = []
+    if summary_type == "map":
+        content = llm_manager.get_map_summary()
+        label = "🧾 Map Summary"
+    elif summary_type == "stuff":
+        content = llm_manager.get_stuff_summary()
+        label = "📚 Stuff Summary"
+    else:
+        content = None
+        label = "❓ Unknown Summary"
+    if content:
+        history.append({"role": "assistant", "content": f"{label}\n\n{content}"})
+    else:
+        history.append({"role": "assistant", "content": f"⚠️ No {label.lower()} available."})
+    return history
+def send_chat_message(message, history):
+    if history is None:
+        history = []
+    history.append({"role": "user", "content": message})
+    bot_entry = {"role": "assistant", "content": ""}
+    history.append(bot_entry)
+    response = ""
+    context_level = "high"  # default in caso non venga restituito
+    # Supporta nuova struttura restituita da stream_message
+    stream = llm_manager.stream_message(message, contextualize=True)
+    for chunk in stream:
+        if isinstance(chunk, dict):  # nuova versione con chunk + livello
+            response += chunk["content"]
+            context_level = chunk.get("context_level", "high")
+        else:  # retrocompatibilità
+            response += chunk
+        # aggiorna contenuto in tempo reale
+        bot_entry["content"] = _add_trust_icon(response, context_level)
+        yield history
+    yield history
+def authenticate(username, password):
+    if check_user(username, password):
+        llm_manager.initialize_conversation()
+        # Solo messaggi assistant (il primo dovrebbe essere l’initial summary)
+        assistant_msgs = [
+            {"role": "assistant", "content": msg.content}
+            for msg in llm_manager.messages if msg.type == "ai"
+        ]
+        return (
+            gr.update(visible=False),  # Hide login
+            gr.update(visible=True),   # Show chat
+            gr.update(value="", visible=False),  # Clear error
+            assistant_msgs  # Initial chat history
+        )
+    else:
+        return (
+            gr.update(visible=True),
+            gr.update(visible=False),
+            gr.update(value="❌ Incorrect username or password", visible=True),
+            []
+        )
+# UI layout: a login column shown first and a chat column that starts hidden.
+with gr.Blocks(fill_height=True) as demo:
+    # Login form; authenticate() toggles this column's visibility.
+    with gr.Column(visible=True) as login_section:
+        gr.Markdown("### 🔒 Login Required")
+        username_input = gr.Textbox(label="Username")
+        password_input = gr.Textbox(label="Password", type="password")
+        login_button = gr.Button("Login")
+        error_message = gr.Text("", visible=False)
+    # Chat panel; authenticate() makes it visible after a successful login.
+    with gr.Column(visible=False) as chat_section:
+        chat_configuration = gr.Markdown("")
+        # "Thinking" indicator toggled by show_spinner / hide_spinner around each reply.
+        spinner = gr.Markdown("⏳ Sto pensando...", visible=False)
+        with gr.Row():
+            map_btn = gr.Button("🧾 Map Summary")
+            stuff_btn = gr.Button("📚 Stuff Summary")
+        chat = gr.Chatbot(
+            label="Video Helper",
+            type="messages"
+        )
+        # NOTE(review): the name `input` shadows the Python builtin within this module.
+        input = gr.Textbox(
+            label="Input",
+            placeholder="Type something here..."
+        )
+        send_btn = gr.Button("Send")
+        # Holds the submitted text so the textbox can be cleared before the reply streams.
+        stored_message = gr.State()
+        # Conversation history passed to the handlers below.
+        # NOTE(review): chat_history is only an *output* of the login handler; the
+        # chat and summary handlers write to `chat` alone, so whether their
+        # appended messages persist in this State should be confirmed.
+        chat_history = gr.State(value=[])
+        # Summary buttons append one assistant message and render it in the chat.
+        map_btn.click(
+            fn=lambda history: get_summary("map", history),
+            inputs=[chat_history],
+            outputs=[chat]
+        )
+        stuff_btn.click(
+            fn=lambda history: get_summary("stuff", history),
+            inputs=[chat_history],
+            outputs=[chat]
+        )
+        # Enter key: stash text and clear box -> show spinner -> stream reply -> hide spinner.
+        input.submit(
+            fn=lambda x: (x, ""),
+            inputs=input,
+            outputs=[stored_message, input]
+        ).then(
+            fn=show_spinner,
+            outputs=[spinner]
+        ).then(
+            fn=send_chat_message,
+            inputs=[stored_message, chat_history],
+            outputs=chat
+        ).then(
+            fn=hide_spinner,
+            outputs=[spinner]
+        )
+        # Send button: identical chain to the Enter-key path above.
+        send_btn.click(
+            fn=lambda x: (x, ""),
+            inputs=input,
+            outputs=[stored_message, input]
+        ).then(
+            fn=show_spinner,
+            outputs=[spinner]
+        ).then(
+            fn=send_chat_message,
+            inputs=[stored_message, chat_history],
+            outputs=chat
+        ).then(
+            fn=hide_spinner,
+            outputs=[spinner]
+        )
+    # Login: authenticate() fills chat_history, then the follow-up step copies it into the chat.
+    login_button.click(
+        authenticate,
+        [username_input, password_input],
+        [login_section, chat_section, error_message, chat_history]
+    ).then(
+        fn=lambda history: history,
+        inputs=[chat_history],
+        outputs=[chat]
+    )
+# Launch the app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+from dotenv import load_dotenv
+from utilities.llm.LlmManager import LlmManager
+from utilities.vectorstore.QdrantLangchainManager import QdrantLangchainManager
+from langchain_openai import ChatOpenAI
+from sentence_transformers import CrossEncoder
+# Model name passed to LlmManager by initialize().
+MODEL = "gpt-4o-mini"
+# Language code passed to LlmManager by initialize().
+LANGUAGE = "en"
+# CrossEncoder instance; initialize() creates it the first time it runs.
+crossencoder_model = None
+# Username -> password map filled by initialize() and read by check_user().
+credentials = {}
+def check_user(username, password):
+    if username in credentials and credentials[username] == password:
+        print("🔑 Login successful!")
+        return True
+    else:
+        print("❌ Incorrect username or password")
+        return False
+def initialize(app=None):
+    global crossencoder_model
+    try:
+        load_dotenv()
+        if crossencoder_model is None:
+            print("Loading CrossEncoder model...")
+            crossencoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+            print("CrossEncoder model loaded!")
+        credentials[os.getenv("USERNAME")] = os.getenv("PASSWORD")
+        i = 1
+        while os.getenv(f"USERNAME_{i}") and os.getenv(f"PASSWORD_{i}"):
+            credentials[os.getenv(f"USERNAME_{i}")] = os.getenv(f"PASSWORD_{i}")
+            i += 1
+        qdrant_manager = QdrantLangchainManager(
+            qdrant_url=os.getenv("QDRANT_URL"),
+            qdrant_api_key=os.getenv("QDRANT_API_KEY"),
+            crossencoder_model=crossencoder_model
+        )
+        llm_manager = LlmManager(
+            qdrant_manager=qdrant_manager,
+            model=MODEL,
+            language=LANGUAGE
+        )
+        return llm_manager, qdrant_manager
+    except Exception as e:
+        print("Error initializing configuration:", e, flush=True)
+        return None
+    # Qdrant
+    # try:
+    #     qdrant_manager = QdrantLangchainManager(
+    #         qdrant_url=os.getenv("QDRANT_URL"),
+    #         qdrant_api_key=os.getenv("QDRANT_API_KEY"),
+    #         llm=ChatOpenAI(model="gpt-4o-mini", streaming=True),
+    #         crossencoder_model=crossencoder_model
+    #     )
+    #     config['qdrant_manager'] = qdrant_manager
+    #     config['qdrant_connected'] = True
+    # except Exception as e:
+    #     config['qdrant_connected'] = False
+    #     print(f"Error connecting to Qdrant: {e}", flush=True)
+    return config

data/txt/Key statisitcs startups.txt ADDED Viewed

	@@ -0,0 +1,67 @@

+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)
+Hi everyone and welcome to this video lecture from the Entrepreneurial Literacy Initiative. I am Martti Wask and today we're going to present to you some brief key statistics about entrepreneurship. Before we go into the mud, let me give you some brief definition of what I understand about a startup.
+So to some extent startups are just like any other business, right, or more specifically small businesses because they're small at the beginning. But there are three characteristics that make them unique. Startups are young, high growth orientated businesses.
+Look at these two GIFs. So on the left hand side we've got the small or the traditional business, right. You see how the kid is turning into a woman or even an elder woman at a very slow pace.
+In contrast the startup is making it up to the adult life very suddenly. A second feature or a third feature that is characteristic of startups is that they are disruptive in nature. They're here to disrupt traditional industries.
+Think about this small business about two or three taxis, right. And then Uber comes or Lyft comes and they put some technology to make everything much more efficient from the customer standpoint. So they just break the traditional industries who are just left with policies and regulations to get rid of these new innovations.
+Same thing for CDs which is something that you may not have used in your life but perhaps you've seen it at home with your parents. So there used to be these CD stores in the cities where you just went there and buy some CDs. Now the music industry is much more on streaming almost for free or these freemium business models and going into concerts.
+Now let's jump on to some statistics about startups. We're going to start from very macro type of things and just I don't want to call it startup dynamics but rather firm dynamics because this is the firm births and firm deaths from Eurostat in year 2019. So this is just to give you an idea of the number of companies that are born every year which is more than a quarter of a million for bigger countries such as Spain, Italy, France, Germany, Poland and Turkey.
+And as for firm deaths we've got in those same countries something north of 130 000 companies that die. So there is a positive net effect of more and more businesses being created every year. You can check out more on these statistics at the European Commission website.
+Some other and perhaps more relevant to us statistics are the number of high growth firms in each country. If you are increasing your employee base by 10 percent or higher you're going to be considered as per Eurostat a high growth company in the year 2018 which is this specific statistic. And what we can see is that the number of firms that are considered high growth firms are a little over 15 percent in countries such as Greece, Netherlands, Spain and Ireland.
+Another important statistic is the context. So to become an entrepreneur it is fundamental the context the ecosystem in which you live. And the Global Entrepreneurship Monitor the GEM report which is very widely known it's a worldwide survey of adult population regarding entrepreneurship topics.
+And so it essentially tracks four different dimensions. First the attitudes and perceptions of adults towards entrepreneurship, second the motivational aspects towards entrepreneurship, then the actual activity of entrepreneurialism and the impact of this entrepreneurship. As per the Netherlands we observe how 60.8 percent of the adults who are surveyed consider to know someone who has started a new business recently.
+Another remarkable statistic is that 82.9 percent of Dutch people consider to be very easy or easy to start a business. And this makes the Netherlands to rank number three in the list of 43 countries surveyed. As per the entrepreneurial intentions which is the question of whether people are expected to become entrepreneurs in the next three years, 13.1 percent of adults say so in their responses to the question.
+And this ranks the Netherlands number 28 out of 43. Some other important statistics are the total early stage entrepreneurial activity. This is a statistic that is created on purpose by the GEM report and it essentially tells you the number of adults who are nascent entrepreneurs or who recently became business owners.
+In the case of the Netherlands 11.5 percent of the surveyed adults did so. So there's a TEA ratio of 11.5 percent. Seven percent are established business owners so they became business owners longer than say 42 months I believe it is.
+And another bucket that is relevant for the context of entrepreneurship in the Netherlands is the motivations. So Dutch people become entrepreneurs to make a difference and to build great wealth which means a lot about the optimism of Dutch adults. So these are kind of the top two reasons by which Dutch people Dutch individuals become entrepreneurs.
+Another important dimension that the GEM report tracks is the entrepreneurship impact. So 1.5 percent of Dutch adults consider to be giving employment to more than six people within the next five years or in five years time. And two percent consider to be selling their products and services to an international base.
+International meaning the revenues of the firm having 25 percent or more coming from international markets. And these ranks the Netherlands number 11 out of a list of 43 surveyed countries. In comparison we've got also the Swedish statistics.
+And so in comparison to the Netherlands a little less people know someone who has started a business only 48.5 percent as opposed to 60.8 percent for the Netherlands. 80 percent 80.1 percent considers that it is easy to start a business and 8.3 percent has entrepreneurial intentions. So a little lower than the Netherlands in all these variables.
+As per the motivation to start a business these are the two top reasons again to make a difference and to build great wealth. This is similar to the Dutch case. For the entrepreneurship impact job expectations just 1.1 percent of adults consider to be providing employment to more than six people in five years time.
+And only 1.4 percent or 1.4 percent of adults consider to be selling their products and business to international markets as opposed to two percent for the Netherlands. In the slide deck I provide some other countries that you can check if you like. But interestingly and talking again about context or ecosystems the Global Startup Ecosystem Report 2021 offers a ranking of entrepreneurial hubs.
+So where there is the ecosystem or context to start a business. Of course no surprise that Silicon Valley comes in number one. Then New York City and London and Beijing being number four.
+So the Chinese hubs are hitting strong in this rankings more and more. Then we have some other US cities like Boston or Los Angeles. Israel is number seven.
+Shanghai, another Chinese hub, then Tokyo and Seattle making it to the top 10. Some other important cities European cities aside from London and if you will Tel Aviv are Paris and Amsterdam or the Delta region let's say Utrecht Rotterdam etc. Then we've got Stockholm tying in number 17 with Singapore.
+The Swedish ecosystem is very noteworthy. There are some very recent a billion dollar startups such as Spotify or Klarna which are very remarkable cases. And then we've got Berlin making to the bottom of the top 25 list for the best entrepreneurial ecosystems.
+Now if we go on to firm survival just to make you an idea of you starting up a business what are your chances to make it to the next year. So Eurostat here provides statistics of survival rates for one year the light blue, three years the orange ones and five years the dark blue. So these are for 2018 and if we take into account the dark blue we observe that more or less they go on to 40 to 60 percent of survival rate meaning that you have let's say make an average of 50 percent one out of two businesses wind down after five years of operations.
+So you have a 50 percent chances of success if you will. Finally and now going a little bit more into the micro level into the firm level and analysing startup profitability. So most of startups are not profitable right from the beginning.
+So here you have on the right side the pre-seed and so the different life cycle of the firm pre-seed and startups. You can see an orange coloured and red coloured that they are not profitable yet. So the orange colour is that they expect to break even they expect to be profitable in more than two years.
+So still a long road to profitability and the red colour is they expect to become profitable to break even in less than two years. So essentially in the pre-seed and the startup stage is very common to be unprofitable and once you go a little bit more on to more mature firms the profitability turns into something more normal so to speak. So this lack of profitability means that the startups really require of external financing to get their operations going and I want to give you because financing is kind of a key topic a hot topic when we talk about startups just a very brief idea of what we're talking about when we're dealing with financing in the startup world.
+So essentially you have four ways in which you can finance the business revenue which is the preferred one it's essentially your customers are providing you the money to run your operations and this is by far the best way to finance your business. Then you've got the debt that is a bit delicate because banks or whoever who lends you money can lend you the money but sometimes they ask you for personal guarantees. So think about it a startup you just created the startup there's no track record there are no assets that you can use as collateral there so the debt holders will really ask you for guarantees.
+Watch out what type of guarantees you provide. I've seen entrepreneurs actually backing up the money that they use for the business with their personal wealth and this is extremely dangerous. If the company eventually goes south then you will have to pay with your assets be it a car be it a motorbike be it your future salary to the debt holders and so that's why we created the limited liability company to limit the liability of the entrepreneur in this case.
+So try to avoid as much as you can backing up some money into the into the business with your personal wealth and be aware that a business sucks money burns cash very easily just by having a couple of employees only two paying some office rents some supplies some marketing ads that you want to do on Facebook, Google, Instagram etc. This all can easily eat you up to a hundred a hundred fifty thousand euros in a year. So you really have to watch out what you back out with your personal wealth.
+So having to return a hundred fifty thousand to the bank right when you're 25 or 30 is not the best thing you want to do with a death company. Third you have the grants the grants are eventually like free money so it's they're wonderful but the problem is time with there you cannot count on them to run your operations they're going to come in one year time they're going to come in 1.5 year you will always be waiting for that ministry signature that needs to be given before the money can be released so it's really tiring long processes a lot of justifications they're wonderful because it's free financing and you always need to do especially if some there are some lines that really match your startup or the innovation that you're trying to pursue but you cannot count on them. And finally you're left with equity which is the traditional type of financing a startup that aims to scale up very fast and so with the equity you have different instruments or you have different actors at the different stages of the firm you can start at the pre-seed and seed stage with angels business angels well you really start with the family and the friends right once you've used up those funds then you have to go with angels business angels which are the first type of professional investor that you're going to get for some venture capital funds they like to go and really have an eye at pre-seed and the seed stage even though they're more they're more acting when when there's some sort of growth there's some sort of metrics going on there is when you can attract venture capital and eventually if there's strong growth the venture capital will sell their stake into the private equity actors so the expansion or the maturity and the private equity together with the venture capital will take the business either to an m&a to a big m&a transaction or to uh to ipo if successful or to another private equity fund that is bigger than the one you just received so there are several exit strategies 
for them let me finally give you some personal reflections on what it takes to be an entrepreneur and if you're lucky enough a successful entrepreneur first thing that i would advise you to do try to focus on your customer try to focus on your product and how to reach the customer make the customer love your product because if you have some traction then things come and things i mean financing i mean uh a lower marketing expenses because you're going to get known through your customer base i mean things will happen if the customer loves the product so the very first thing to do is to build up a nice product it doesn't need to be developed to a great extent but some product that really tackles some issue some societal issue and therefore it's bought right that's the best thing to do and don't worry about the tricks and about the abc of how to raise financing etc if you have a steep revenue line things will come to your company second reflect about the difficult side of being a successful entrepreneur if you're lucky enough to get into a successful company you are the dad of the business which means that everyone expects a lot from you you cannot show any weakness you have to be there 24 7 so if you go on holidays and there's any fire any alarm there it's going to be you that have to wipe that out also you can get easily tired with the project so once you've done two three four years of the same stuff you can get tired of the project but guess what you cannot just abandon you have an employee base of 10 15 people say 40 50 or hundreds if you're successful enough and so they count on you you are the leader and this is a burden that not so many people think when they start into entrepreneurship but you need to think about that you're going to be the leader and it's not that easy to jump down from the bandwagon especially if you have venture capitals on board because then there will be ownership contracts there will be some contractual ties that will tie you to the 
business for several years and you cannot undo those contracts so there are some things that you need to think about because before you get an into a entrepreneurship of course if you don't get into venture capital you just start your nice idea that you want to scale up you have this growth adventure ahead of you or you expect to have it and eventually it doesn't work you can always transition into a small business and make a very good and decent living out of that have your work-life balance there so it's not that one thing goes without the other you can always transition from having a startup idea and then eventually transitioning to small business because all that traction that you expected is not there or the opposite you can start as a small business I've known some consultants that eventually think about how to provide consulting on a high scale level and then from this consulting a small business they turn into startup so you can actually go the two ways but think reflect before beginning with an entrepreneurial career there are some contractual ties you can get tired of the project and this is something that it commonly happens I would say and no one talks it openly it doesn't make it to the headlines for no reason entrepreneurs have high degrees of depression and mental problems so there is a study by professors at Berkeley showing that entrepreneurs have a 30 percent more probability to be depressed as opposed to other adults another tip do not hurry to start an entrepreneurial career you don't have to start right when you're 20 21 in fact you will see in future modules that you will increase the odds of success if you wait a little longer so unless you have the next Facebook and you experience that your side projects that you're studying as a student as a graduate student or whatever or early employee traction a lot such as Facebook etc etc then is when you're going to face a decision to make am I turning an entrepreneur right now at the age of 20 or do 
I wait a little longer then it's it's kind of a live decision but before if you're thinking about an idea do not hesitate to take some some working experience at good companies if it's a startup company perhaps even better and also just a final note as entrepreneur and especially if you are successful enough you're going to be surrounded by a lot of people by a lot of people that will tell you how charming they are and how good they've been in advising that other company in being a mentor of that other company by the way in exchange of one percent or two percent of the company so just trust your intuition this is nothing new you've been dealing with people for all your life so try to smell how the person him or herself is before engaging further with him or her there are tonnes of people that will try to convince you that it's worth engaging further with them watch out them because there are many well I hope you enjoyed the video lecture just nothing else stay tuned for the next modules of A.L.I. that will be as exciting as this video lecture bye-bye
+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)

data/txt/Risk and Return part 1 v1.txt ADDED Viewed

	@@ -0,0 +1,63 @@

+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)
+Hello everyone and welcome to this video lecture from the Entrepreneurship Literacy Initiative. I am Marco Da Rin and today we are going to look at risk and return in entrepreneurship. This is actually the first of two video lectures on the topic that will look at both the facts and interpretation of how entrepreneurs think and act about the risk and the returns that they can reap from their entrepreneurial endeavours.
+Now let's start with something that you have already seen in a previous video lecture with Marti. The fact number one that you have looked at is the high default rate which is typically over 50 percent in the first few years after the creation of a company. And this means that companies are often left stranded and don't yield any return to their initiators, to their founders.
+And this is a picture which tries to convey this image of total dismay and failure. Second fact is that at the same time while entrepreneurial companies often fail, they have a high potential for achieving extremely high returns. And sometimes these are really huge and in fact as an illustration we can think of the Elon Musk IPO of Tesla a few years ago and we know that the sky is the limit even though SpaceX is still very far away from its IPO.
+Now what are the risks and returns to entrepreneurship really? These two facts, high default rates and high potential, seem to be at odds with each other because it's very risky and often it doesn't yield much but it could even be a big lottery ticket. Now let's look more at what economic and finance research has dealt with and dug out from many different studies. Now the first thing that we want to bring you is some key facts.
+And here is what we are going to see today. In the second lecture we'll try to make sense of these facts and provide some deeper interpretation to think how to act about it. Now let me give you a preview today.
+What are these key results? The first is that there is a high dispersion of returns across different ventures which means that some of them are extremely unsuccessful, typically fail, typically fail quickly, sometimes fail painfully after quite a few years. But at the same time on the other hand, which means on the other extreme of the distribution, we see that some ventures achieve very high returns. Now the second key result is that this dispersion is then compounded in the fact that the average entrepreneur does not really earn a high return, and especially not a return high enough to compensate for the risk of failure, which is very high.
+It's a little bit like trying to get through a fire and knowing that you are easily burnt a little bit and so the typical fire worker is not going to enjoy it very much. Now this is something which is documented by many studies across different periods and countries. Don't worry, I'm not going to bore you with a full review of this, but I'll just like to look at three of them which bring out some very interesting evidence.
+Now let's start with this study published in the Journal of Economic Perspectives which tries to go back to what it calls the roots of entrepreneurship, trying to understand why entrepreneurs seemingly take actions which don't square very well with our view of what rational agents often do. Now this, which is basically a survey of previous studies, provides this very interesting picture and the picture shows the pattern of failures and successes across about half a million firms founded in the United States in 1996. So it looks at a cohort, it's a full population of these companies and then it looks at sales six years afterwards in 2002 and now the y-axis provides the share of firms which experience a certain outcome and this outcome in terms of sales is put as buckets in the x-axis.
+Now what do we see here? It's very interesting because if you notice almost exactly 50 percent of the companies fail, so they experience zero sales. Failure is extremely common, it's the most common outcome. But then if we go to the next buckets, what do we see? Well we see that another about 10 percent achieves very low sales, less than a hundred thousand dollars and that may seem a large sum to you but if we think that these are sales, not profits, they are after all not so much for a business after six years of its creation.
+Probably they just barely helped the founder make it. Then if we go further in sales, we see that a number of another 30-20 percent of firms achieves returns which could be kind of decent if it's a small corner shop but they're not very good if it's a more ambitious business and these are kind of between 100 and 500 thousand and they can go up to a million. And so we see that there are also about eight percent of companies which reach very good sales, one to five millions.
+But then the other very interesting fact is that as we go to really really good outcomes, let's say above five millions, between five and ten, we get just two percent, about two percent of these companies. And if we think about the big earners, more than ten millions, and again these are sales not profits, well we have about one percent. So this is basically what I was saying before, putting together the two facts.
+First that failure is extremely common and that these small results are the norm. Second, that there is a wide dispersion because if we could really kind of put not just buckets but points here, we would go extremely far away on the y-axis and we see that this is a very small number of firms which really make it okay. Now this is clearly just the observation about companies founded in one year, 1996, so one could think that this is kind of a non-representative sample.
+No, it is representative, because many other studies find very similar patterns across different times and countries. So this is really a key feature of entrepreneurship. The second study looks a little bit more closely at US private companies and it's a study by Tobias Moskowitz and Annette Vissing-Jorgensen, who are two very successful financial economists, which looks at what they call the private equity premium puzzle.
+So they say that entrepreneurs invest in their companies which are private, mostly remain private, just a very very tiny number goes public, and so this is private equity in that sense, equity which is privately held. And where does this puzzle stand? The puzzle stands in the following facts. So they look at a representative sample of 4,000 US households which own a business and they follow it through a decade from 1989 through 1999.
+Interestingly, first of all, these households hold the vast majority of their companies, so they get somebody else to invest in the company. In a few cases, it's a business angel or a venture capitalist or another company or sometimes even the state. Most other cases, it's other people like friends, families or acquaintances.
+And therefore, they hold most of their wealth in this company, which could be their corner shop or it could be a small business or could be a more ambitious venture. A key feature here is that these companies are highly illiquid, which means that it's very difficult to sell this equity. So if for some reason you need to sell part of your holdings, it's very difficult because it's a private company and it's not transparent and very few people would have the guts to invest in something which is so risky because they find it very difficult to understand it, to know it and to run it.
+And so beyond one key point of this paper is that beyond failure risk, there is also the risk of illiquidity that you need to get some money because maybe you need it for medical reasons or for sending your kids to school or for moving to a different location and you find it very difficult. So that's another dimension which entrepreneurs should take into account and we are going to see this more into the second video lecture about this topic. Now, having introduced illiquidity risk, they find that the return to private equity, equity held in private businesses is not statistically higher than that to public equity.
+So engaging in an entrepreneurial venture or being an employee, developing your own human capital, getting a good job and investing your savings in public equity in the stock market, you have very similar returns. And these are similar in level but also in volatility. And so the puzzle here is that these two authors estimate a loss of about 10% per year if adjusted for failure and illiquidity risk to holders of this equity, to entrepreneurs.
+And clearly, this is quite puzzling also because it's not a mistake. These people keep investing in their company over the decade over which they are followed by this survey. And so this to economists brings a puzzle.
+Why do people behave like this? Now, a third paper adds another piece to the puzzle. And this is a paper by Bob Hall and his wife and non-academic economist, Susan Woodward. Bob Hall has been the doyen of American macroeconomists for several decades.
+And he has been the chair of this committee which declared the start and the end of recessions in the US. And so he knows a little bit about kind of how the vagaries of the economy, the downturns and upturns work. And he applied this knowledge by looking at a sample of a venture-backed company.
+So this is not just the whole population of private businesses, but it's the subset which is funded by venture capital first, which in a way we could think is different from the others because these are more ambitious companies. And they also call their contribution, their study, the burden of non-diversifiable risk of entrepreneurship. And they also focus on the fact that there is something more to failure risk and illiquidity risk.
+And in particular, they look at 22,000 VC-backed US businesses between about 1987 and 2007. And what do they find? That entrepreneurs are highly undiversified. This is true both for the general population of entrepreneurs and for venture-backed entrepreneurs who put not only all their financial capital, but also all their human capital, their reputation into play into one single business.
+Now, they also find that there is, and that's an interesting fact, a negative correlation between exit value and time to exit. So in most cases, the most successful exits, the most successful returns to entrepreneurs and investors are made when a company is exiting through an acquisition or an IPO in a very short time. So in that sense, being fast is being good.
+And last but not least, and very important for our attempt to understand risk and reward, is that about three quarters, 75% of these entrepreneurs reap no rewards. Part of it is due to the type of contracts that venture capitalists give when they provide financing, and venture capitalists are very savvy investors. So they are able to make sure that they recoup most of their investment, even if the company doesn't do very well, which means that in most cases, if the company doesn't do extremely well, the entrepreneurs don't get any rewards.
+And again, this is puzzling because why do these very smart, well-equipped, very educated people engage into activities which are highly undiversified? Okay, so they get a lot of risk, a lot of failure and illiquidity risk. And unless they really do very well, they will not gain much in terms of financial rewards. So this brings us to our preliminary conclusion, preliminary because it's the first of two video lectures, which is to bring up these facts.
+Entrepreneurship is highly risky, it has strong variation in returns, some people do extremely well, most people don't do well at all. There is substantive failure risk and holding equity in your company is highly illiquid because it's difficult to sell it to outsiders. And if we compute returns and adjust them for these two types of risk, especially, then the results are disappointing.
+Still, entrepreneurs keep opening businesses. Why? This is an interesting puzzle that all these empirical set of studies, literature brings us. And so I'll leave you with a little bit of curiosity.
+And we'll pick this up again in the next video lecture, where we provide some additional facts and some conceptual framework to understand what is going on and make sense of this apparent puzzle. Entrepreneurs certainly are not stupid. They're not reckless people.
+They mostly know what they're doing, but they seem to defy standard economic views. Why? Well, stay tuned and in the next video lecture, we'll explain you why. Thank you and see you soon.
+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)

data/txt/Risk and Return part 2 v1.txt ADDED Viewed

	@@ -0,0 +1,123 @@

+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)
+Hello everyone and welcome to this video lecture from the Entrepreneurship Literacy Initiative. I am Marco Da Rin and today we are going to talk about risk and return in entrepreneurship. This is the second part in a two video lecture series which tries to understand the risk and return of entrepreneurial activities.
+Now let's pick up where we left off last time. So we have shown that entrepreneurship is a highly risky business and it exhibits a strong variation in returns with most ventures failing miserably and a few of them, very few of them doing well or extremely well. So failure risk and illiquidity risk are particularly high in this realm and we have shown that risk adjusted returns to entrepreneurship seem to disappoint.
+And so we cannot think really that entrepreneurs are solely driven by financial returns because that is not the case. Still, entrepreneurs keep opening businesses. Why? Today we are going to explain.
+So there are three main explanations that could reconcile the evidence and our economic intuition. The first one is that entrepreneurs are different. So they have different risk preferences than the general population and in particular they are more risk tolerant.
+They engage in entrepreneurship because they can bear more risk. Second potential explanation is that there are not only financial returns but other types of private benefits in the sense that entrepreneurs receive some benefit which is private, which is not financially in monetary terms and therefore they are compensated with more than just financial returns. Third potential explanation is that entrepreneurs are different because they are overconfident.
+They are too optimistic about the future of their ventures. Now let's see a little bit more of concepts and evidence about each of these potential explanations. First of all, let's look at risk preferences.
+Now to understand well the importance of risk preferences, let me introduce the risk-return frontier, which is a key tool in financial economics. Now consider the following example. A project is offered to you which has an expected return of 10% with a risk of 20%.
+As you can see here, the risk means that the return will fall between 9 and 11%. It's 20% of the 10% which is expected. Now any point in this segment between 9 and 11% is possible and this is what is offered to you.
+Now consider an alternative. Consider now that risk goes to 30% and so the expected return is always 10% but now the possible outcome has a wider range between 8.5 and 11.5 which is 3 percentage points now. Now let me give you a few seconds to think.
+Which one would you prefer between these two projects? Think about it. Make up your mind. Okay, let's see.
+Clearly there is no right or wrong answer. It depends on your preferences towards risk and a lot of empirical evidence tells us that most people, most individuals behave in a risk-averse way so they need to be compensated to bear more risk. So if the risk moves from 20 to 30% of the expected return, then you need to be compensated because there is a possibility, yes, that you can do better, 11.5 instead of 11, for example, but you can also do worse, 8.5 instead of 9. So the extremes are pushed out, the boundaries are pushed out, and so that is the fact that there is more risk.
+And most people don't like this. So a typical person with a risk of 30% would not be willing to accept this second project, would stick with the first one or would accept a higher risk if the expected return is higher. In this case, 12%, it's just an example.
+So with a 12% expected return, the 30% risk is equivalent to 3.6 percentage points, and so the range of possible values goes from 10.2 to 13.8, so 1.8 percentage points above or below 12%, which is the expected value. Some people may be even more risk averse and require 13% or 14% of compensation, pushing the boundaries further away. Now, why is this important? This is important because we can also express this through what we call the risk-return frontier, which is simply the set of all points which link the expected return to different levels of risk.
+In this case, we see the two points that we have seen before, and we can think that they are on a concave trajectory. This could also be linear or convex. There is a lot of work on this, but the essential point for us is that more risk must be compensated by a higher expected return.
+That's what we need to know to understand the entrepreneurial choice. Now, what are the data about this? Well, a conjecture is that, as I said before, entrepreneurs are more risk tolerant than the general population, so they're willing to start riskier businesses than the average person. Now, this is confirmed by quite some literature, in particular by two studies that we briefly look at now.
+The first study is by Bob Hall and Susan Woodward. It's the one that you have seen in the first video lecture. What do they do in the second part of their article is to consider what they call the certainty equivalent of entrepreneurial opportunity.
+For each project in those 22,000 venture-backed startups that they consider, they estimate how much the entrepreneur, based on the returns that they receive in the actual data, would be willing to pay to own the project. Clearly, if this is a positive sum, it's the value of the project to the entrepreneur. If it is a negative sum, this is how much the entrepreneur needs to be compensated to undertake the project.
+They do this for a number of different parameters. In the first column here, we see different degrees of risk aversion. Then for each degree of risk aversion, they see what is the yearly income of the entrepreneur per year in his or her previous job.
+Then they assess, they look at how much wealth accumulated the entrepreneur has at the beginning of the project. Why? That's very interesting. You will probably wonder why.
+Well, we know that risk aversion depends a lot on our wealth. Wealthier people tend to be more risk tolerant. Why? Because they have a cushion, so they can take more risk than somebody who lives from hand to mouth.
+Therefore, here they consider four possible different wealths. 100,000, 1 million, 5 million, or 20 million at the beginning of the venture. Now let's see how these combinations work out.
+We see that for very low risk aversion, this is a relative coefficient, basically all entrepreneurs are willing to pay some money to own their project. This clearly is something which decreases with their pre-tax income. Why? Because this is a better outside option.
+If I'm earning, like here, 2 million, I will be only willing to engage in a project if it has a really high expected payoff. Otherwise, I'm not going to take it up, because my current job is already quite lucrative. Now we see that what is interesting, the key message is that the higher the risk aversion, the less the entrepreneur is willing to pay, which is what we would expect based on economic reasoning.
+In particular, we see that already with an intermediate level of risk aversion, well, many entrepreneurs are not willing to pay much. They're willing to pay little, or they sometimes need even to be compensated. This is certainly true when risk aversion becomes fairly high.
+Two is a relatively high coefficient of relative risk aversion. What does it mean? It means that, yes, risk aversion plays a role, and less risk averse people tend to be willing to engage in entrepreneurship more often. This is more true if their wealth is higher and if their current employment income is lower.
+Then you can go back to this slide and see all the possible combinations. This is a very interesting result, because it comes out of actual data. A similar result is obtained by a Norwegian and Greek team of economists, which look at Norwegian data, at a cohort of 400,000 Norwegian people, and they obtain this interesting result.
+They test the joint hypothesis that entrepreneurs are more risk tolerant, and therefore they found businesses more often. Second, these businesses perform worse. Why? Because they are willing to accept, for a given level of risk, a lower expected value.
+That's the risk-return frontier that we have looked at before. Notice that this contradicts the prediction of the risk-return frontier. The fact that you will be willing to engage in entrepreneurship in risky business only if you are compensated with a higher return.
+Here, if you are very risk tolerant, that frontier will tend to be flat, or almost flat. It's not upward sloping as we have seen before, which is a general attitude of people. In that sense, yes, entrepreneurs tend to be slightly different than the general population.
+They find in this Norwegian data evidence that confirms these hypotheses. These preferences appear to provide some explanation to our puzzle. This is the first learning point from this video lecture.
+Now, let's turn to see how this plays in our previous example, so that we can relate to the little theory that I proposed to you. We see that instead of going for a 12% expected return to accept a riskier project, what both these studies tell us is that probably entrepreneurs will be willing to accept a very small, only say 11% increase in expected return. Some of them may even be willing to go for higher risk with the same expected return, because they are not just risk tolerant, but they are risk loving.
+They like the fact that they are engaging in a risky business. This clearly is an empirical question. It depends on how individuals behave.
+Now, let's move to the second potential explanation. The existence of private benefits which are not factored into the financial returns of the venture. Here we look at just one study.
+There are many, but time is clearly pressing. We look at one very recent and intriguing study, which looks at entrepreneurship as a way to keep options open. Sylvain Catherine, a French economist who works at the Wharton School at the University of Pennsylvania, looks at French administrative data, very detailed data, and explores the idea that clearly founders may reap different types of unmeasured benefits.
+There are three possible ones. One is non-pecuniary. For example, they enjoy the entrepreneurial experience.
+They really like running a company. They get this out of their experience. That doesn't have a monetary measure.
+It's enjoyment, exactly as much as you like studying. You just like it. You don't need to be compensated for it.
+You just go for it. You're just listening to this video lecture. Very good.
+The second possibility is that people like independence. That's also been documented, that people don't like to have a boss, and so they like to be their own masters. That is, in itself, something which gives them utility, satisfaction.
+A second line of private benefit is legal tax avoidance, so unreported earnings. The fact that many small businesses manage to obtain favourable tax treatments or avoid taxes very legally, sometimes also illegally, but that is clearly very difficult for us to measure. But even legal tax avoidance has been shown to be contributing a bit to the creation of small businesses.
+Not so much ambitious ventures, because those will operate in a very transparent way as being incorporated and then being acquired or IPOed. The third dimension is that founders may enjoy the experience that leads to better post-failure employment. Even if you fail or if you sell your company, you may get back to the job market and obtain an employment which is as good or even better than the one that you had before.
+This is the one dimension where this economist, Catherine, focusses on. He focusses in particular on the option to return to the job market. The finding is that this is material.
+I have to drop down to show this to you. We are also learning how to do these video lectures, you see. The result is that for the average French entrepreneur, the option is worth, over the typically six years of engagement in the venture, about €137,000, which is quite some money.
+What does it mean? It means that the experience that is accumulated in these years is worth €137,000 on average when they go back to employment. The other interesting result is that the option to return as an employee, the fact that you know that you can count on going back, that if you sell your company or even if it fails, you are not going to be on the door. You will find an employment.
+This accounts for about 40% of new business creation in France in about 20 years that Catherine explores in the data. This is also something very interesting. People get private benefits of different sorts, and this motivates them to jump into entrepreneurship.
+Last but not least, overconfidence. Entrepreneurs are different from the general population because they are more optimistic. They see the reality as rosier than it is.
+Well, as this highly cited psychology paper tells us, this is not so easy to find out. The title, The Trouble with Overconfidence, says it all. These two US psychologists argue the following.
+They say that there are three concepts which are actually hidden into this overconfidence. One is overestimation of one's ability. The fact that we think that we are better than we are.
+The second is what they call overplacement. The fact that we tend to place our own skills above those of others. The first one is an absolute, and this is a relative type of concept when we interact with other people.
+Third, overprecision. We tend to have an excessive confidence in the accuracy of our own beliefs about the future. Now, much of their analysis is conceptual and shows that it is difficult to build and disentangle these concepts in an empirical setting.
+So it is very difficult to do some research in this direction and find out compelling results. So in that sense, we should be cautious. There is one interesting study, though, which brings up quite an interesting result.
+This is by Manju Puri and David Robinson, both at Duke University in the US. Now, what they find, they do something which is quite smart. So they build a measure of optimism which bypasses the criticism of the psychologists.
+And they say, hey, let's build a measure which is not something which relates to your entrepreneurial experience or to the realm that you are going to investigate, but it's something which is orthogonal to it, different. And so they ask people, they ask people in the sense they use a vast survey of US adults, what is their life expectancy? So how long will you live? Now, for each individual, they are able to build a very good measure because they use demographic and personal information which is picked up in the survey. And they use actuarial techniques which tell us that depending on some general coordinates of our life, health, and habits, they can get a very confident measure of our lifespan.
+Now, using this measure of optimism, they find that optimism positively correlates with entrepreneurial activity. So it's true in the sense that people who have a positive way of thinking tend to engage in entrepreneurial activity more often. At the same time, they find that extreme optimism leads to hazardous choices.
+So this doesn't make for good entrepreneurs. So it's not that the wild guys are wild in the sense of wildly optimistic guys are better entrepreneurs. No, but entrepreneurs tend to be people who see things positively and so probably this also helps them overcome difficulties.
+This is clearly the conjecture which would need more research to be validated. And so there are also studies which show that moderate optimism is correlated with lower forecast errors of the consequences of one's actions. So if you are moderately positive, you are also more precise in forecasting the future of your actions' consequences.
+And so this also makes for good entrepreneurs. So how do we put all these things together? Well, what we can conclude from this review of the excellent literature is that first of all, entrepreneurship cannot be explained only by the search for returns because we know that from the previous video lecture, this does not happen. Second, that several elements play an important role in this decision.
+And risk attitudes, private benefits are certainly important. And in particular, personality traits and own preferences play a role. So one conclusion that we draw which we think is interesting for you guys is learn who you are and what your life goals are is important to understand whether entrepreneurship is something which is good.
+It could be a good choice for you or not. So we leave you with this thought and interesting result and we conclude the video lecture here. So I hope you enjoyed it, that you found something interesting and stay tuned for more material coming from ELI.
+See you. Bye bye.
+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)

data/txt/VL Myths about entrepreneurs.txt ADDED Viewed

	@@ -0,0 +1,83 @@

+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)
+Hi everyone, and welcome to this video lecture from the Entrepreneurship Literacy Initiative. I'm Martí Guasch, and today we're going to talk about myths about entrepreneurs. So many times when we think about entrepreneurship, we think about young college dropouts, right? Hi Mark, love you too.
+So we have a number of stereotypes that are mainly driven by the headlines that we read in newspapers, by blog posts that we consult every now and then, and essentially we see these young people making millions of dollars. Is that true? Well, that's what we are here for. One of the stereotypes is that you have to go through well-established business accelerator programmes, such as Y Combinator.
+Many entrepreneurs getting out of the Y Combinator, also known as YC, were able to fund their businesses with millions of dollars. Some of the stereotypes is that you need to solve a founder personal problem, so you as a founder really know how to tackle the problem. You can also think of being the first to market, and so you get out or you get most of the customer base.
+You could also think of being placed or located in San Francisco, New York, Boston, these places, right? Where you can raise like a lot of money. So nothing could be further from reality. We're here to talk to you about two studies that will really give us some facts and knowledge about the main characteristics of successful entrepreneurs.
+And you'll see that most of them are not precisely young. The first study that we're going to consult is done by Ali Tamaseb, who wrote this book, Super Founders, very recently, 2021. So he's got a sample of entrepreneurs from 2005 to 2018, and he differentiates which ones made it to multi-billion dollar valuations and which ones did not.
+And he tries to find out the different characteristics that make specific entrepreneurs reach a valuation of more than $1 billion. So he ran hundreds of interviews. He consulted thousands of LinkedIn profiles, Crunchbase profiles, et cetera, et cetera.
+Overall, he got 30,000 data points and he evaluates more than 60 characteristics of entrepreneurs. The second study that we're going to look at is done by Azoulay, Jones, Kim, and Miranda in 2020, published in the American Economic Review: Insights. More academic, but don't be scared.
+We're only going to show some descriptive statistics tables. So they analyse every single startup that was founded in the U.S. between 2007 and 2009. And they track those startups for five years, employment and revenue, to be able to discern which ones were successful startups from those that did not grow that much.
+The good thing of this study is that they analyse the population. So they really get to the administrative data they're able to extract from the U.S. Census Bureau. So it's really not a representative sample, but instead the population as a whole.
+So let's start by the first myth, founder's age. Again, we all think about young college dropouts like Mark Zuckerberg or this Aaron Levie that I feature here, which is kind of this stereotypical entrepreneur. So he's the founder of Box.
+And back in 2005, when he was studying at University of Southern California, he was running an internship at Paramount Pictures. What else? Entertainment in California, right? Hollywood. And so what he experienced there is that people went with these huge files, huge video files.
+It's an entertainment company with a USB stick. And they were going office in, office out with these USB sticks. So what he did in his next college project was to gather some students around him.
+And they started or they presented this cloud storage platform that they eventually called it Box. So because the demand soared that much, he dropped out from college at the age 19 years old. And just 10 years later, 2015, the company was going IPO, achieving $175 million in proceeds and a valuation a bit north of $1.6 billion.
+So kind of the typical story, success story about entrepreneurs. Now, is this really true or not? So let's get into Azoulay et al. 2020.
+What they did is they run different columns. So the first columns is all startups in the U.S. between 2007 and 2009. What is the average age of the founders? And what you can see is that the entire U.S. average age is 41.8. So rather old.
+And you can partition it by tech employment, by VC-backed firms, by firms that eventually register a patent, etc., etc. You see how the age is more or less the same. It's over 40.
+The columns to the right are those firms that grow the most. The top 10% firms, the top 5%, top 1%, and top 0.1%. Interestingly, what you can see is that the more to the right you go, the higher the age, the mean age of the founder is. So for all startups, it's 41.8. If you go to the right for the United States, the top 0.1% average 45 years of age for the main founder.
+The very last column, successfully exit startups, are those startups that make it to IPOs, so to the public markets, or to an M&A, so an acquisition by a big player. So as you can see again, the average age of these successfully exited startups is relatively higher than the normal age, than the average age of 41.8, by being at 46.7. So in essence, successful startups still do not have these typical college dropouts, which are rather an outlier. You can partition the sample by US region, and it doesn't matter whether you are in California, Massachusetts, New York, Silicon Valley, or some other entrepreneurial hubs like Texas, or Durham, Raleigh, and North Carolina, that the age is more or less the same thing.
+Myth number two, the number of founders. So it is said that you really need a team to found a successful company. And nothing far from reality.
+What you can see is that almost 20% of the successful startups, and this is from Ali Tamaseb, from the multi-billion dollar startups, are founded by a solo founder. This has some advantages. So there are no conflicts of ownership between the co-founders, conflicts about personalities, different visions, different goals, etc., that many times are one main cause of failure for startups.
+Now, one warning. Solo founders companies typically have founders that already co-founded a firm before. So they know what the process is about, and now they just want to run it solo.
+Perhaps to make most of the profits, right? Some examples of it is Langley Steinert from CarGurus, which IPO-ed in 2017, valued at more than 1.5 billion. So he was a former co-founder of TripAdvisor. Or Zhang Yiming from ByteDance.
+ByteDance is the company running TikTok. And so he is the main co-founder, and he already co-founded several other companies before, 99Fang and Kuxun. Myth number three, about education.
+And again, here I put you a list of startup founders that eventually dropped out from college. And look how successful they were. Mark Zuckerberg from Facebook.
+Michael Dell from Dell Computers. Bill Gates. Spiegel from Snapchat.
+Elon Musk. All these guys never graduated from college. And so these are the ones that are making the headlines most of the times.
+But how about the average successful entrepreneur? Are these outliers? And it seems that the answer is yes. So look at the distribution of how educated founding CEOs are. Again, successful founders that made it to more than a billion-dollar valuation.
+This is the study by Ali Tamaseb. A very high proportion of them have at least a bachelor. If not bachelor plus MBA or master studies.
+So again, founders' education seemed to matter. So the more educated you are, the more chances of success you're going to have. Yet, some of the most successful entrepreneurs are concentrated at top U.S. schools.
+You have to think that this is coming from Ali Tamaseb. And Ali was analysing U.S. startups. So this is totally biassed towards the U.S. But we can take it with the European mentality too.
+The fact of going to a top school, there's no denial that you have more chances to get into or to have found a successful startup. Here we've got a strong list of Stanford, Harvard, MIT, UC Berkeley, etc., etc. But there are some other universities that are not top Ivy League schools.
+Such as the University of Illinois, Chicago, UIUC. We've got University of Waterloo, the Brigham Young University, BYU, etc., etc. So there are some founders of very successful startups that never made it to a top school.
+Myth number four, work experience. Well, again, if you're a young college dropout, it doesn't matter whether you have some work experience that you're going to nail it anyways. And this doesn't seem to be the case.
+So what is the corporate experience before founding this successful startup? Well, 50% of the founders seem to have more than 10 years of experience. And now you may say, okay, but are these 50% of co-founders the ones that make it to the multibillion dollars valuation? And the answer is, well, yes. 30% of multibillion dollar founders didn't work for anyone else before, but the 70% remaining did.
+And if they worked before co-founding a successful startup, they made it at a tier one company, typically. And the tier one, again, we are in the U.S. market. This is the Google, Oracle, IBM, Yahoo, etc., etc.
+You can take it into your own country and think about what types of companies are the tier one companies. Also, work experience seems to pay off. What you have here with the red box is the probability of a successful exit.
+And this chart tells you whether the entrepreneur has experience in the same industry, considering industry as a NAICS industry classification code. So two digit NAICS, four digits NAICS, and six digit NAICS is panel C. So essentially what you have here is that if you have never, never, ever touched the same industry to which you're starting up the company, you have a 0.13% of probability of a successful exit. If you had one to two years of experience, this probability remains more or less the same at 0.13%. However, if you've been for more than three years in the same six digit NAICS code industry, then the probability of success raises almost double to 0.21%. So just to wrap up, most of the times you see a lot of stereotyping things that you see on the news, on blogs, on the web in general, etc., etc.
+These might be outliers, and we try to show you that in fact they are outliers. In this video lecture, we've been showing you that being educated, that having work experience, especially in the industry, that not necessarily being a young entrepreneur can really make up for a successful career in entrepreneurship. So as a matter of proof, we're attaching this podcast, this featured story about Arie Belldegrun.
+He is an oncology professor at UCLA, who eventually built up a couple of startups that make it to NASDAQ. So with this, we finish the video lecture. Hope you enjoyed it, and just stay tuned for the next materials from Eli.
+Bye bye.
+(Transcribed by TurboScribe.ai. Go Unlimited to remove this message.)

readme.md ADDED Viewed

	@@ -0,0 +1,36 @@

+**ENV**
+- ***create*** python -m venv .venv
+- ***activate*** .venv\Scripts\activate
+---
+**REQUIREMENTS.TXT**
+- ***generate minimal*** pipreqs . --force --ignore .venv
+- ***install*** pip install -r requirements.txt
+- ***re-generate*** pip freeze > requirements.txt
+- ***run application*** gradio app.py or python app.py
+- ***manually install missing packages*** pip install package_name
+---
+**PUBLISHING ON HUGGING FACE**
+- Create a new Space SDK GRADIO BLANK
+- git clone https://huggingface.co/spaces/StefanoDUrso/ELI #clone the EMPTY gradio project on a local HUGGINGFACE folder
+- cd into the new HUGGINGFACE folder
+- git remote add github https://github.com/paisleypark3121/ELI.git #add the GITHUB repository as new REMOTE
+- git remote -v #this to verify the remotes; this is the output:
+    - origin https://huggingface.co/spaces/StefanoDUrso/ELI (fetch)
+    - origin https://huggingface.co/spaces/StefanoDUrso/ELI (push)
+    - github https://github.com/paisleypark3121/ELI.git (fetch)
+    - github https://github.com/paisleypark3121/ELI.git (push)
+- git pull github main --allow-unrelated-histories #downloads all GITHUB code into the HUGGINGFACE folder
+- git add .
+- git commit -m "Sync GitHub with Hugging Face"
+- git push origin main
+Every time we want to sync the code that we pushed to GITHUB, we need to:
+- cd into the HUGGINGFACE folder
+- git pull github main #it pulls all the new code pushed into GITHUB
+- git push origin main #it pushes the code into HUGGINGFACE space
+---

requirements.txt ADDED Viewed

	@@ -0,0 +1,110 @@

+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.16
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+attrs==25.3.0
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+dataclasses-json==0.6.7
+distro==1.9.0
+fastapi==0.115.12
+ffmpy==0.5.0
+filelock==3.18.0
+frozenlist==1.5.0
+fsspec==2025.3.2
+gradio==5.25.2
+gradio_client==1.8.0
+greenlet==3.1.1
+groovy==0.1.2
+grpcio==1.71.0
+grpcio-tools==1.71.0
+h11==0.14.0
+h2==4.2.0
+hpack==4.1.0
+httpcore==1.0.8
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.30.2
+hyperframe==6.1.0
+idna==3.10
+Jinja2==3.1.6
+jiter==0.9.0
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.3.23
+langchain-community==0.3.21
+langchain-core==0.3.51
+langchain-openai==0.3.12
+langchain-qdrant==0.2.0
+langchain-text-splitters==0.3.8
+langsmith==0.3.30
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.4.3
+mypy-extensions==1.0.0
+networkx==3.4.2
+numpy==2.2.4
+openai==1.73.0
+orjson==3.10.16
+packaging==24.2
+pandas==2.2.3
+pillow==11.2.1
+portalocker==2.10.1
+propcache==0.3.1
+protobuf==5.29.4
+pydantic==2.11.3
+pydantic-settings==2.8.1
+pydantic_core==2.33.1
+pydub==0.25.1
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-multipart==0.0.20
+pytz==2025.2
+pywin32==310
+PyYAML==6.0.2
+qdrant-client==1.13.3
+regex==2024.11.6
+requests==2.32.3
+requests-toolbelt==1.0.0
+rich==14.0.0
+ruff==0.11.5
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+semantic-version==2.10.0
+sentence-transformers==4.1.0
+setuptools==78.1.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+SQLAlchemy==2.0.40
+starlette==0.46.2
+sympy==1.13.1
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tiktoken==0.9.0
+tokenizers==0.21.1
+tomlkit==0.13.2
+torch==2.6.0
+tqdm==4.67.1
+transformers==4.51.3
+typer==0.15.2
+typing-inspect==0.9.0
+typing-inspection==0.4.0
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.1
+websockets==15.0.1
+yarl==1.19.0
+zstandard==0.23.0

utilities/llm/LlmManager.py ADDED Viewed

	@@ -0,0 +1,212 @@

+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+from utilities.vectorstore.SummaryManager import SummaryManager
+# Upper bound on the number of messages kept after the first one when the
+# conversation history is trimmed (see LlmManager._roll_messages).
+MAX_MESSAGES = 50
+# Italian system prompt template; filled with str.format using the
+# `context` and `question` fields.
+system_message_it = """
+Sei ELI, un assistente che aiuta gli studenti ad analizzare un video e rispondere a domande su di esso.
+Basandoti sul contesto fornito, rispondi alla domanda dell'utente.
+- Contesto: {context}
+- Domanda: {question}
+"""
+# English system prompt template; same `context` / `question` fields as the
+# Italian variant.
+system_message_en = """
+You are ELI, an assistant that helps students analyze a video and answer questions about it.
+Based on the provided context, answer the user's question.
+- Context: {context}
+- Question: {question}
+"""
+def get_system_message(language="en"):
+    if language == "it":
+        return system_message_it
+    else:
+        return system_message_en
+# English prompt template used when no usable context was retrieved;
+# filled with str.format using only the `question` field.
+fallback_prompt_en = """
+You are ELI, an assistant that helps students analyze a video and answer questions about it.
+Unfortunately, no relevant context could be found in the provided material.
+- If possible, try to answer based on the previous conversation.
+- Otherwise, inform the user that no verified information is available to answer the question reliably.
+- Question: {question}
+"""
+# Italian counterpart of the fallback prompt; same single `question` field.
+fallback_prompt_it = """
+Sei ELI, un assistente che aiuta gli studenti ad analizzare un video e rispondere a domande su di esso.
+Purtroppo, non è stato trovato alcun contesto rilevante nei materiali forniti.
+- Se possibile, prova a rispondere basandoti sulla conversazione precedente.
+- Altrimenti, informa l’utente che non hai informazioni verificate per rispondere in modo affidabile.
+- Domanda: {question}
+"""
+def get_fallback_prompt(language="en"):
+    fallback_prompts = {
+        "en": fallback_prompt_en,
+        "it": fallback_prompt_it
+    }
+    return fallback_prompts.get(language, fallback_prompts["en"])
+def get_disclaimer(context_level, language="en"):
+    disclaimers = {
+        "en": {
+            "medium": "\n\n⚠️ Note: the retrieved context has moderate similarity. The answer may not be fully reliable.",
+            "summary": "\n\n🟠 Note: the context is based on a general summary of the content. Please verify the information if needed.",
+            "low": "\n\n⚠️ No reliable information was found in the source material. The answer may rely only on the conversation.",
+            "no_context": "\n\n⚠️ No context available. The assistant will try to respond based on previous conversation, if possible."
+        },
+        "it": {
+            "medium": "\n\n⚠️ Nota: il contesto recuperato ha una similarità moderata. La risposta potrebbe non essere pienamente affidabile.",
+            "summary": "\n\n🟠 Nota: il contesto usato è un riassunto generale del contenuto. Verifica le fonti se necessario.",
+            "low": "\n\n⚠️ Nessuna informazione affidabile trovata nei materiali. La risposta potrebbe basarsi solo sulla conversazione.",
+            "no_context": "\n\n⚠️ Nessun contesto disponibile. L’assistente proverà a rispondere in base alla conversazione, se possibile."
+        }
+    }
+    return disclaimers.get(language, {}).get(context_level, "")
+class LlmManager():
+    def __init__(self, qdrant_manager=None, model="gpt-4o-mini", language="en"):
+        self.model=model
+        self.llm = ChatOpenAI(model=model, streaming=True)
+        self.language = language
+        self.system_message = get_system_message(language)
+        self.messages = []
+        self.qdrant_manager = qdrant_manager
+    def set_qdrant_manager(self, qdrant_manager):
+        self.qdrant_manager = qdrant_manager
+    def reset_messages(self, context, question):
+        self.messages = [SystemMessage(content=self.system_message.format(context=context, question=question))]
+    def stream_message(self, message, contextualize=False):
+        """
+        Streaming equivalente a send_message, con gestione dinamica del contesto e localizzazione.
+        """
+        if contextualize:
+            context = ""
+            context_level = "no_context"
+            if self.qdrant_manager and self.qdrant_manager.is_loaded():
+                context, context_level = self.qdrant_manager.get_context_for_query(message)
+                print(f"📚 Contesto recuperato [{context_level}]: {context[:100]}")
+            disclaimer = get_disclaimer(context_level, self.language)
+            base_prompt = get_system_message(self.language)
+            # Caso: contesto assente (no_context/low) ➝ prompt fallback
+            if not context.strip() and context_level in ["low", "no_context"]:
+                formatted_message = get_fallback_prompt(self.language).format(question=message)
+            else:
+                formatted_message = base_prompt.format(context=context, question=message) + disclaimer
+            # Inserisci o aggiorna il SystemMessage
+            if self.messages and isinstance(self.messages[0], SystemMessage):
+                self.messages[0] = SystemMessage(content=formatted_message)
+            else:
+                self.messages.insert(0, SystemMessage(content=formatted_message))
+            # Aggiungi il messaggio utente
+            self.messages.append(HumanMessage(content=message))
+        else:
+            self.messages.append(HumanMessage(content=message))
+        self._roll_messages()
+        response = ""
+        for chunk in self.llm.stream(self.messages):
+            response += chunk.content
+            yield {"content": chunk.content, "context_level": context_level}
+        self.messages.append(AIMessage(content=response))
+        return response, context_level
+    def send_message(self, message, contextualize=False):
+        if contextualize:
+            if not self.messages:
+                # Primo messaggio: contestualizzo
+                context = ""
+                if self.qdrant_manager and self.qdrant_manager.is_loaded():
+                    context = self.qdrant_manager.get_context_for_query(message) or ""
+                self.reset_messages(context=context, question=message)
+            else:
+                self.messages.append(HumanMessage(content=message))
+        else:
+            self.messages.append(HumanMessage(content=message))
+        self._roll_messages()
+        print("\n--- MESSAGES ---")
+        for i, msg in enumerate(self.messages):
+            role = msg.type.upper()
+            preview = msg.content.strip().replace("\n", " ")[:120]
+            print(f"{i+1:02d}. [{role}] {preview}...")
+        print("--- END ---\n")
+        response = ""
+        for chunk in self.llm.stream(self.messages):
+            response += chunk.content
+            yield chunk.content  # STREAM output to Gradio
+        # Alla fine, salva l’output completo nei messaggi
+        self.messages.append(AIMessage(content=response))
+    def initialize_conversation(self):
+        if self.qdrant_manager and self.qdrant_manager.is_loaded():
+            summary_manager = SummaryManager(
+                language=self.language,
+                qdrant_manager=self.qdrant_manager,
+                model=self.model
+            )
+            summary = summary_manager.do_initial_summary()
+            if summary:
+                preamble = {
+                    "it": "Questo contenuto tratta i seguenti temi principali:",
+                    "en": "This content covers the following main topics:"
+                }
+                follow_up = {
+                    "it": "Vuoi approfondire qualche aspetto in particolare?",
+                    "en": "Would you like to explore any of these points further?"
+                }
+                intro_message = f"{preamble.get(self.language)}\n\n{summary}\n\n{follow_up.get(self.language)}"
+                self.messages = [
+                    SystemMessage(content=self.system_message.format(context="", question="")),
+                    AIMessage(content=intro_message)
+                ]
+    def get_map_summary(self):
+        summary_manager = SummaryManager(
+            language=self.language,
+            qdrant_manager=self.qdrant_manager,
+            model=self.model
+        )
+        summary, _, _ = summary_manager.do_summary_map_reduce()
+        return summary
+    def get_stuff_summary(self):
+        summary_manager = SummaryManager(
+            language=self.language,
+            qdrant_manager=self.qdrant_manager,
+            model=self.model
+        )
+        summary, _, _ = summary_manager.do_summary_stuff()
+        return summary
+    def _roll_messages(self):
+        """
+        Keeps only the last `MAX_MESSAGES` from `messages`, excluding the first (SystemMessage).
+        The first message (SystemMessage) is always preserved.
+        """
+        if len(self.messages) > MAX_MESSAGES + 1:  # +1 to account for the SystemMessage
+            self.messages = [self.messages[0]] + self.messages[-MAX_MESSAGES:]  # Keep SystemMessage + last MAX_MESSAGES

utilities/llm/__pycache__/LlmManager.cpython-312.pyc ADDED Viewed

Binary file (10.9 kB). View file

utilities/vectorstore/QdrantLangchainManager.py ADDED Viewed

	@@ -0,0 +1,370 @@

+import os
+from dotenv import load_dotenv
+import tiktoken
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, Filter, FieldCondition, MatchValue
+import torch
+from torch.nn.functional import sigmoid
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_qdrant import QdrantVectorStore
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import TextLoader, PyPDFLoader
+from langchain_core.documents import Document
+from langchain_core.messages import SystemMessage, HumanMessage
+ENCODING_NAME='o200k_base'
+class QdrantLangchainManager:
+    def __init__(self, qdrant_url, qdrant_api_key,
+                 system_message=None, crossencoder_model=None,
+                 batch_size=500, chunk_size=2000, chunk_overlap=50,
+                 vector_size=1536, re_ranking_threshold=0.7):
+        self.qdrant_url = qdrant_url
+        self.qdrant_api_key = qdrant_api_key
+        self.client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
+        self.batch_size = batch_size
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.vector_size = vector_size
+        self.re_ranking_threshold = re_ranking_threshold
+        self.reranker = crossencoder_model
+        self.collection_name = None
+        self.vectorstore = None
+    def is_loaded(self):
+        if self.collection_name is None:
+            return False
+        if self.vectorstore is None:
+            return False
+        return True
+    def create_collection(self, collection_name):
+        try:
+            if not self.client.collection_exists(collection_name):
+                self.client.create_collection(
+                    collection_name=collection_name,
+                    vectors_config=VectorParams(size=self.vector_size, distance=Distance.COSINE)
+                )
+                print(f"✅ Collection '{collection_name}' created successfully.")
+            else:
+                print(f"⚠️ Collection '{collection_name}' already exists.")
+            self.vectorstore = QdrantVectorStore(
+                client=self.client,
+                collection_name=collection_name,
+                embedding=OpenAIEmbeddings(),
+            )
+            self.collection_name = collection_name
+            return True
+        except Exception as e:
+            print(f"❌ Error creating collection '{collection_name}': {e}")
+            return False
+    def get_collection(self, collection_name):
+        try:
+            if not self.client.collection_exists(collection_name):
+                print(f"⚠️ Collection '{collection_name}' doesn't exist.")
+                return False
+            self.vectorstore = QdrantVectorStore(
+                client=self.client,
+                collection_name=collection_name,
+                embedding=OpenAIEmbeddings(),
+            )
+            self.collection_name = collection_name
+            return True
+        except Exception as e:
+            print(f"❌ Error getting collection '{collection_name}': {e}")
+            return False
+    def insert_document(self, file_path, max_tokens=None):
+        """Load a .txt or .pdf file, chunk it and add the chunks to the vector store.
+
+        Args:
+            file_path: Path of the file to ingest; only ".txt" and ".pdf"
+                extensions are handled.
+            max_tokens: When given, the full text is returned only if the
+                document's token count does not exceed this value.
+
+        Returns:
+            A ``(success, total_tokens, full_text)`` tuple. ``full_text`` is
+            None unless ``max_tokens`` is set and the document fits in it.
+            ``(False, 0, None)`` is returned when no collection is loaded or
+            when any step raises (including the ValueError for unsupported
+            file types, which is caught below).
+        """
+        if not self.vectorstore:
+            print("⚠️ No collection initialized. Please create or load a collection first.")
+            return False, 0, None
+        try:
+            file_extension = os.path.splitext(file_path)[-1].lower()
+            file_name = os.path.basename(file_path)
+            # Select the appropriate loader
+            if file_extension == ".txt":
+                loader = TextLoader(file_path)
+            elif file_extension == ".pdf":
+                loader = PyPDFLoader(file_path)
+            else:
+                raise ValueError(f"Unsupported file type: {file_extension}")
+            # Load and split documents
+            docs = loader.load()
+            full_text = "\n".join([doc.page_content for doc in docs])
+            # _calculate_tokens is defined outside this view; presumably it
+            # counts tokens with the module's tiktoken encoding (confirm).
+            total_tokens = self._calculate_tokens(full_text)
+            print(f"📄 Total tokens: {total_tokens}")
+            text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
+            chunks = text_splitter.split_documents(docs)
+            # Merge small chunks if needed
+            # Repeat until a pass performs no merge, capped at three passes.
+            max_iterations = 3
+            iteration = 0
+            merge_performed = True
+            while merge_performed and iteration < max_iterations:
+                chunks, merge_performed = self._merge_chunks(chunks)
+                iteration += 1
+                print(f"🔄 Merge Iteration {iteration}: {len(chunks)} chunks remain.")
+            # Running offset so chunk_index keeps increasing across batches.
+            global_idx = 0
+            for i in range(0, len(chunks), self.batch_size):
+                batch = chunks[i:i + self.batch_size]
+                batch_docs = [
+                    Document(
+                        page_content=chunk.page_content,
+                        metadata={
+                            "type": "content",
+                            "file_name": file_name,
+                            "chunk_index": global_idx + j  # index is global, not per batch
+                        }
+                    )
+                    for j, chunk in enumerate(batch)
+                ]
+                global_idx += len(batch)
+                self.vectorstore.add_documents(batch_docs)
+                print(f"✅ Inserted {len(batch_docs)} documents (Batch {i // self.batch_size + 1})")
+            # The conditional applies to the third tuple element only.
+            return True, total_tokens, full_text if max_tokens is not None and total_tokens <= max_tokens else None
+        except Exception as e:
+            print(f"❌ Error inserting documents: {e}")
+            return False, 0, None
+    def insert_text(self, text, metadata=None):
+        """
+        Insert a single block of text into the vectorstore, with optional metadata.
+        """
+        if not self.vectorstore:
+            print("⚠️ No collection initialized. Please create or load a collection first.")
+            return False
+        try:
+            doc = Document(
+                page_content=text,
+                metadata=metadata or {}
+            )
+            self.vectorstore.add_documents([doc])
+            print("✅ Text inserted into vector store.")
+            return True
+        except Exception as e:
+            print(f"❌ Failed to insert text into vector store: {e}")
+            return False
+    def get_documents(self):
+        try:
+            total_points = self.client.count(self.collection_name, exact=True).count
+            if total_points == 0:
+                print("Qdrant collection is empty.")
+                return []
+            print(total_points)
+            results = self.client.scroll(
+                collection_name=self.collection_name,
+                limit=total_points,
+                with_vectors=True,
+                with_payload=True
+            )
+            return [
+                {
+                    "id": point.id,
+                    "vector": point.vector,
+                    "payload": {
+                        "page_content": point.payload.get("page_content", ""),
+                        **point.payload.get("metadata", {})  # <--- merge dei metadati qui!
+                    }
+                }
+                for point in results[0]
+            ]
+        except Exception as e:
+            print(f"Error fetching documents from Qdrant: {e}")
+            return []
+    def get_context_for_query(self, query, top_k=8):
+        """Retrieve context text for *query* together with a confidence label.
+
+        Returns a ``(context, level)`` tuple where ``level`` is one of
+        ``"high"``, ``"medium"``, ``"summary"``, ``"low"`` or
+        ``"no_context"``. The context is an empty string for ``"low"`` and
+        ``"no_context"``. Any exception is caught and reported as
+        ``("", "no_context")``.
+        """
+        print("🔎 get_context_for_query:", query)
+        if not self.vectorstore:
+            print("⚠️ No collection initialized.")
+            return "", "no_context"
+        try:
+            # Over-fetch by 3 so that dropping non-"content" documents can
+            # still leave up to top_k candidates.
+            docs_with_scores_raw = self.vectorstore.similarity_search_with_score(query, k=top_k + 3)
+            docs_with_scores = [
+                (doc, score)
+                for doc, score in docs_with_scores_raw
+                if doc.metadata.get("type") == "content"
+            ][:top_k]
+            if not docs_with_scores:
+                print("⚠️ Nessun documento con type='content' trovato.")
+                return "", "no_context"
+            print(f"📦 Trovati {len(docs_with_scores)} documenti candidati.")
+            # Without a reranker, apply the plain score threshold.
+            if not self.reranker:
+                valid_docs = [
+                    doc.page_content for doc, score in docs_with_scores
+                    if score > self.re_ranking_threshold
+                ]
+                if valid_docs:
+                    return "\n\n".join(valid_docs), "high"
+                else:
+                    return "", "low"
+            # With a reranker: _reranking is defined outside this view;
+            # presumably it yields ((doc, raw_score), probability) pairs (confirm).
+            reranked = self._reranking(query, docs_with_scores)
+            if not reranked:
+                print("⚠️ Reranker ha restituito risultati vuoti.")
+                return "", "no_context"
+            high_conf, medium_conf = [], []
+            # Fixed cut-offs: above 0.7 is high, above 0.3 is medium.
+            for (doc, _), score in reranked:
+                if score > 0.7:
+                    high_conf.append(doc.page_content)
+                elif score > 0.3:
+                    medium_conf.append(doc.page_content)
+            if high_conf:
+                print(f"✅ {len(high_conf)} documenti con alta confidenza.")
+                return "\n\n".join(high_conf), "high"
+            elif medium_conf:
+                print(f"🟡 {len(medium_conf)} documenti con confidenza media.")
+                return "\n\n".join(medium_conf), "medium"
+            # Fallback: look for stored summaries among all documents.
+            print("🟠 Nessun chunk valido. Provo a usare i summary.")
+            summaries = [
+                d for d in self.get_documents()
+                if d["payload"].get("type") in ["map_summary", "stuff_summary", "initial_summary"]
+            ]
+            if not summaries:
+                print("⚠️ Nessun summary trovato.")
+                return "", "low"
+            # Score each summary against the query; sigmoid maps the raw
+            # reranker outputs into the 0-1 range before thresholding.
+            summary_pairs = [(query, d["payload"]["page_content"]) for d in summaries]
+            raw_scores = self.reranker.predict(summary_pairs)
+            prob_scores = sigmoid(torch.tensor(raw_scores)).tolist()
+            summary_candidates = [
+                d["payload"]["page_content"]
+                for d, score in zip(summaries, prob_scores)
+                if score > 0.5
+            ]
+            if summary_candidates:
+                print("🟠 Uso il/i summary come contesto.")
+                return "\n\n".join(summary_candidates), "summary"
+            print("❌ Nessun summary supera la soglia.")
+            return "", "low"
+        except Exception as e:
+            print(f"❌ Errore nel recupero del contesto: {e}")
+            return "", "no_context"
+    def delete_collection(self, collection_name):
+        try:
+            self.client.delete_collection(collection_name)
+            print(f"🚨 Collection '{collection_name}' has been deleted.")
+            self.vectorstore = None
+            return True
+        except Exception as e:
+            print(f"❌ Error deleting collection '{collection_name}': {e}")
+            return False
+    def _merge_chunks(self, chunks, min_size=500, max_size=2000):
+        if not chunks:
+            return [], False
+        merged_chunks = []
+        temp_text = ""
+        merge_performed = False
+        if len(chunks[0].page_content) < min_size and len(chunks) > 1:
+            chunks[1] = Document(page_content=chunks[0].page_content + " " + chunks[1].page_content)
+            chunks = chunks[1:]
+            merge_performed = True
+        for chunk in chunks:
+            text = chunk.page_content
+            if not temp_text:
+                temp_text = text
+                continue
+            if len(text) < min_size:
+                temp_text += " " + text
+                merge_performed = True
+            else:
+                while len(temp_text) > max_size:
+                    merged_chunks.append(Document(page_content=temp_text[:max_size]))
+                    temp_text = temp_text[max_size:]
+                merged_chunks.append(Document(page_content=temp_text))
+                temp_text = text
+        if temp_text:
+            merged_chunks.append(Document(page_content=temp_text))
+        return merged_chunks, merge_performed
+    # def _reranking(self, query, docs_with_scores):
+    #     if not self.reranker:
+    #         print("⚠️ Reranker not initialized. Skipping reranking.")
+    #         return docs_with_scores
+    #     query_pairs = [(query, doc.page_content) for doc, _ in docs_with_scores]
+    #     new_scores = self.reranker.predict(query_pairs)
+    #     return sorted(zip(docs_with_scores, new_scores), key=lambda x: x[1], reverse=True)
+    def _reranking(self, query, docs_with_scores):
+        if not self.reranker:
+            print("⚠️ Reranker not initialized. Skipping reranking.")
+            return docs_with_scores
+        query_pairs = [(query, doc.page_content) for doc, _ in docs_with_scores]
+        # Calcola i logit grezzi
+        raw_scores = self.reranker.predict(query_pairs)
+        # Applica sigmoid per ottenere probabilità tra 0 e 1
+        prob_scores = sigmoid(torch.tensor(raw_scores)).tolist()
+        # Log per debugging
+        print("📈 Reranking scores (sigmoid-normalized):")
+        for i, ((doc, old_score), prob) in enumerate(zip(docs_with_scores, prob_scores)):
+            print(f"{i+1:02d}. chunk_index={doc.metadata.get('chunk_index', '-')}, score={prob:.3f}")
+        return sorted(zip(docs_with_scores, prob_scores), key=lambda x: x[1], reverse=True)
+    def _calculate_tokens(self, text):
+        """Calculate the number of tokens in the given text."""
+        try:
+            encoding = tiktoken.get_encoding(ENCODING_NAME)
+            return len(encoding.encode(text))
+        except Exception as e:
+            print(f"Error calculating tokens: {e}", flush=True)
+            return 0

utilities/vectorstore/SummaryManager.py ADDED Viewed

	@@ -0,0 +1,440 @@

+import os
+import json
+import requests
+from dotenv import load_dotenv
+import tiktoken
+#from langchain_openai import OpenAI
+#from langchain_community.llms import OpenAI
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
+from langchain_community.callbacks.manager import get_openai_callback
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains.summarize import load_summarize_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.schema import Document
+from sklearn.cluster import KMeans
+from sklearn.manifold import TSNE
+#import matplotlib.pyplot as plt
+import warnings
+from warnings import simplefilter
+import numpy as np
+#default_model='gpt-4o-mini'
+#default_embedding_model='text-embedding-3-small'
+#default_temperature=0
+#default_chunk_size=5000
+#default_chunk_overlap=1000
+#default_max_lenght=5000
+#default_separator="\n\n\n\n\n"
+'''
+On average each token contains 4 characters; 1000 tokens is about 750 words
+- chunk size 1000 -> 250 tokens
+- chunk size 2000 -> 500 tokens
+- chunk size 5000 -> 1250 tokens
+- chunk size 10000 -> 2500 tokens
+On GPT-4o-mini:
+- the price for 10k tokens in input is about 0.001€ (1k -> 0.00015)
+- the price for 10k tokens in output is about 0.006€ (1k -> 0.0006)
+- the input limit is 128k tokens
+- the output limit is 16k tokens
+'''
+map_prompt_en = """
+Please provide a summary of the following text:
+{text}
+"""
+map_prompt_it = """
+Per favore, fornisci un riassunto del seguente testo:
+{text}
+"""
+combine_prompt_en = """
+Based on the key points and ideas provided in the previous summaries, craft a final, cohesive summary that presents the information as if it were an original piece of content.
+Ensure that the summary flows smoothly, maintaining a consistent tone and style throughout. Avoid explicitly referencing the structure or order of the previous summaries, and focus on creating a well-organized and comprehensive narrative that conveys the central themes and insights clearly and naturally.
+Return the summary in Markdown format, using `###` for headings, `-` for lists, and keeping lines separated for better readability. Make sure that each item in a list starts on a new line, and that each sentence in a paragraph is separated by a blank line. Never write multiple sentences on the same line, unless they are parts of the same continuous paragraph. Make sure that each mathematical formula is written in LaTeX and enclosed in the delimiters \\( \\) for inline formulas and \\[ \\] for block formulas. Each sentence or idea should be separated by a carriage return for better readability. When displaying code, always use triple backticks (```) and specify the language (e.g. ```html, ```python, ```javascript) to ensure proper formatting and maintain proper indentation and layout.
+{text}
+"""
+combine_prompt_it = """
+Basandoti sui punti chiave e le idee fornite nei riassunti precedenti, crea un riassunto finale e coeso che presenti le informazioni come se fossero un contenuto originale.
+Assicurati che il riassunto scorra in modo fluido, mantenendo un tono e uno stile coerenti in tutto. Evita di fare riferimento esplicito alla struttura o all'ordine dei riassunti precedenti e concentrati sulla creazione di una narrazione ben organizzata e completa che trasmetta chiaramente e naturalmente i temi e le intuizioni centrali.
+Restituisci il riassunto in formato Markdown, utilizzando `###` per i titoli, `-` per gli elenchi e mantenendo le linee separate per una migliore leggibilità. Assicurati che ogni elemento di un elenco inizi su una nuova riga, e che ogni frase di un paragrafo sia separata da una riga vuota. Non scrivere mai più frasi sulla stessa riga, a meno che non siano parti dello stesso paragrafo continuo. Assicurati che ogni formula matematica sia scritta in LaTeX e racchiusa tra i delimitatori \\( \\) per le formule inline e \\[ \\] per le formule di blocco. Ogni frase o idea deve essere separata da un ritorno a capo per una migliore leggibilità. Quando mostri del codice, usa sempre i backtick tripli (```) e specifica la lingua (es. ```html, ```python, ```javascript) per garantire la corretta formattazione e mantenere l'indentazione e il layout corretti.
+{text}
+"""
+initial_summary_prompt_intro_en = """
+Read the following introductory text and write a brief summary of the main themes.
+Start with the sentence: "The content is about..." or a similar phrase, followed by a list of 3–5 key points. Use a clear and informative style.
+"""
+initial_summary_prompt_intro_it = """
+Leggi il seguente testo introduttivo e scrivi un riassunto breve dei temi principali.
+Inizia con la frase: "Il contenuto tratta di..." oppure una formula equivalente, e segui con un elenco di 3–5 punti chiave. Usa uno stile chiaro e informativo.
+"""
+initial_summary_prompt_closing_en = """
+End with a sentence indicating that this is just an introduction and the content continues.
+"""
+initial_summary_prompt_closing_it = """
+Concludi con una frase che segnali che il contenuto continua oltre questo estratto.
+"""
+initial_summary_prompt_formatting = """
+Return the summary in Markdown format, using `###` for headings, `-` for lists, and keeping lines separated for better readability. Make sure that each item in a list starts on a new line, and that each sentence in a paragraph is separated by a blank line. Never write multiple sentences on the same line, unless they are parts of the same continuous paragraph. Make sure that each mathematical formula is written in LaTeX and enclosed in the delimiters \\( \\) for inline formulas and \\[ \\] for block formulas. Each sentence or idea should be separated by a carriage return for better readability. When displaying code, always use triple backticks (```) and specify the language (e.g. ```html, ```python, ```javascript) to ensure proper formatting and maintain proper indentation and layout.
+"""
+def get_map_prompt(language):
+    if language=='it':
+        return map_prompt_it
+    else:
+        return map_prompt_en
+def get_combine_prompt(language):
+    if language=='it':
+        return combine_prompt_it
+    else:
+        return combine_prompt_en
+def get_initial_summary_prompt(language, is_partial=False):
+    if language == 'it':
+        prompt = initial_summary_prompt_intro_it
+        if is_partial:
+            prompt += "\n" + initial_summary_prompt_closing_it
+    else:
+        prompt = initial_summary_prompt_intro_en
+        if is_partial:
+            prompt += "\n" + initial_summary_prompt_closing_en
+    # Aggiungi sempre la parte sulla formattazione Markdown
+    prompt += "\n" + initial_summary_prompt_formatting
+    # Appendice per il blocco di testo
+    prompt += "\n\nText:\n{combined_text}"
+    return prompt
+# Constants
+MAX_TOTAL_TOKENS = 6000  # Safe token limit for summarization
+CHUNK_SIZE = 2000  # Define a fixed chunk size
+MAX_SELECTED_DOCS = 5
+class SummaryManager:
+    def __init__(self, language, qdrant_manager, model='gpt-4o-mini', temperature=0, max_tokens=MAX_TOTAL_TOKENS):
+        self.language = language
+        self.qdrant_manager = qdrant_manager
+        self.model = model
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        print("Model:", self.model)
+        self.llm = ChatOpenAI(model=model, temperature=temperature)
+    def do_initial_summary(self):
+        """
+        Builds a lightweight initial summary using the first 2-3 chunks of the document.
+        If an initial summary already exists in the vector store, it returns that instead.
+        """
+        # STEP 1: cerca se già esiste
+        print("🔎 Checking for existing initial summary...", flush=True)
+        all_docs = self.qdrant_manager.get_documents()
+        for doc in all_docs:
+            if doc["payload"].get("type") == "initial_summary" and doc["payload"].get("language") == self.language:
+                print("✅ Found existing initial summary in vector store.")
+                return doc["payload"]["page_content"]
+        # STEP 2: genera il riassunto se non esiste
+        print("🚀 Generating initial summary from early chunks...", flush=True)
+        if not all_docs:
+            print("⚠️ No documents available for summary.")
+            return None
+        # Filtra solo i documenti con chunk_index
+        chunk_docs = [doc for doc in all_docs if 'chunk_index' in doc["payload"]]
+        total_chunks = len(chunk_docs)
+        # Prendi i primi 3 chunk ordinati
+        selected_chunks = sorted(chunk_docs, key=lambda d: d["payload"]["chunk_index"])[:3]
+        is_partial = len(selected_chunks) < total_chunks
+        combined_text = "\n".join([doc["payload"]["page_content"] for doc in selected_chunks])[:3000]
+        # Prompt dinamico
+        prompt_template = get_initial_summary_prompt(self.language, is_partial=is_partial)
+        prompt = prompt_template.format(combined_text=combined_text)
+        try:
+            from langchain_core.messages import HumanMessage
+            response = self.llm.invoke([HumanMessage(content=prompt)])
+            summary = response.content
+            # STEP 3: salva il riassunto nel vector store
+            inserted = self.qdrant_manager.insert_text(
+                text=summary,
+                metadata={
+                    "type": "initial_summary",
+                    "file_name": self.qdrant_manager.collection_name,
+                    "language": self.language
+                }
+            )
+            if inserted:
+                print("📝 Initial summary saved to vector store.")
+            return summary
+        except Exception as e:
+            print(f"❌ Error generating initial summary: {e}")
+            return None
+    def do_summary_map_reduce(self):
+        """
+        Returns a full summary using Map-Reduce summarization.
+        If a final summary already exists in the vector store, returns that instead.
+        """
+        # STEP 1: check if summary already exists
+        print("🔎 Checking for existing final summary...", flush=True)
+        all_documents = self.qdrant_manager.get_documents()
+        for doc in all_documents:
+            if doc["payload"].get("type") == "map_summary" and doc["payload"].get("language") == self.language:
+                print("✅ Found existing final summary in vector store.")
+                return doc["payload"]["page_content"], 0, 0
+        if not all_documents:
+            print("❌ No documents found in collection.")
+            return None, 0, 0
+        print(len(all_documents), flush=True)
+        # STEP 2: extract vectors & text
+        embeddings = [doc["vector"] for doc in all_documents]
+        documents = [doc["payload"]["page_content"] for doc in all_documents]
+        metadata = [doc["payload"] for doc in all_documents]
+        # STEP 3: select up to MAX_SELECTED_DOCS chunks via KMeans
+        MAX_SELECTED_DOCS = 5
+        selected_docs = self._select_best_chunks(
+            documents=documents,
+            metadata=metadata,
+            embeddings=embeddings,
+            max_chunks=MAX_SELECTED_DOCS
+        )
+        total_tokens = sum(self.llm.get_num_tokens(doc.page_content) for doc in selected_docs)
+        print(f"✅ Selected {len(selected_docs)} docs with total tokens: {total_tokens}")
+        # STEP 4: load LangChain prompts
+        map_prompt_template = PromptTemplate(template=get_map_prompt(self.language), input_variables=["text"])
+        combine_prompt_template = PromptTemplate(template=get_combine_prompt(self.language), input_variables=["text"])
+        print("🔄 Loading summarization chain...")
+        summary_chain = load_summarize_chain(
+            llm=self.llm,
+            chain_type='map_reduce',
+            map_prompt=map_prompt_template,
+            combine_prompt=combine_prompt_template,
+            verbose=False
+        )
+        print("📊 Checking token size of each formatted_doc...")
+        for i, doc in enumerate(selected_docs):
+            token_count = self.llm.get_num_tokens(doc.page_content)
+            print(f"Chunk {i+1}: {token_count} tokens")
+        # STEP 5: run the chain with token tracking
+        with get_openai_callback() as cb:
+            result = summary_chain.invoke({"input_documents": selected_docs})
+            input_tokens_used = cb.prompt_tokens
+            output_tokens_used = cb.completion_tokens
+            total_tokens = cb.total_tokens
+            print(f"🧾 Token usage: total={total_tokens}")
+        full_summary = result['output_text']
+        print("✅ Map-reduce summary generated.")
+        # STEP 6: store the final summary
+        inserted = self.qdrant_manager.insert_text(
+            text=full_summary,
+            metadata={
+                "type": "map_summary",
+                "file_name": self.qdrant_manager.collection_name,
+                "language": self.language
+            }
+        )
+        if inserted:
+            print("📝 Final summary saved to vector store.")
+        return full_summary, input_tokens_used, output_tokens_used
+    def do_summary_stuff(self):
+        """
+        Returns a full summary using STUFF summarization strategy.
+        Uses all documents if total token count is within limits,
+        otherwise selects the best subset under token budget.
+        """
+        # STEP 1: check if summary already exists
+        print("🔎 Checking for existing final summary...", flush=True)
+        all_documents = self.qdrant_manager.get_documents()
+        for doc in all_documents:
+            if doc["payload"].get("type") == "stuff_summary" and doc["payload"].get("language") == self.language:
+                print("✅ Found existing final summary in vector store.")
+                return doc["payload"]["page_content"], 0, 0
+        if not all_documents:
+            print("❌ No documents found in collection.")
+            return None, 0, 0
+        #print(len(all_documents), flush=True)
+        # STEP 2: extract vectors & text
+        embeddings = [doc["vector"] for doc in all_documents]
+        documents = [doc["payload"]["page_content"] for doc in all_documents]
+        metadata = [doc["payload"] for doc in all_documents]
+        # STEP 3: selezione intelligente con fallback a clustering
+        selected_docs = self._get_chunks_for_stuff(
+            documents=documents,
+            metadata=metadata,
+            embeddings=embeddings,
+            llm=self.llm,
+            max_tokens=self.max_tokens,
+            fallback_max_chunks=5
+        )
+        total_tokens = sum(self.llm.get_num_tokens(doc.page_content) for doc in selected_docs)
+        print(f"✅ Selected {len(selected_docs)} docs with total tokens: {total_tokens}")
+        # STEP 4: load chain
+        combine_prompt_template = PromptTemplate(
+            template=get_combine_prompt(self.language),
+            input_variables=["text"]
+        )
+        print("🔄 Running summarization with 'stuff' strategy...")
+        summary_chain = load_summarize_chain(
+            llm=self.llm,
+            chain_type='stuff',
+            prompt=combine_prompt_template,
+            verbose=False
+        )
+        # STEP 5: run the chain with token tracking
+        with get_openai_callback() as cb:
+            result = summary_chain.invoke({"input_documents": selected_docs})
+            input_tokens_used = cb.prompt_tokens
+            output_tokens_used = cb.completion_tokens
+            total_tokens = cb.total_tokens
+            print(f"🧾 Token usage: total={total_tokens}")
+        full_summary = result['output_text']
+        print("✅ Stuff summary generated.")
+        # STEP 6: store the final summary
+        inserted = self.qdrant_manager.insert_text(
+            text=full_summary,
+            metadata={
+                "type": "stuff_summary",
+                "file_name": self.qdrant_manager.collection_name,
+                "language": self.language
+            }
+        )
+        if inserted:
+            print("📝 Final summary saved to vector store.")
+        return full_summary, input_tokens_used, output_tokens_used
+    def _find_closest_embeddings(self,vectors, num_clusters, kmeans):
+        closest_indices = []
+        for i in range(num_clusters):
+            distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
+            closest_index = np.argmin(distances)
+            closest_indices.append(closest_index)
+        selected_indices = sorted(closest_indices)
+        return selected_indices
+    def _count_tokens(text,model):
+        if not text:
+            return 0
+        encoding = tiktoken.encoding_for_model(model)
+        num_tokens = len(encoding.encode(text))
+        return num_tokens
+    def _select_best_chunks(self, documents, metadata, embeddings, max_chunks=5):
+        assert len(documents) == len(metadata) == len(embeddings)
+        num_clusters = min(max_chunks, len(documents))
+        print(f"📌 Selecting {num_clusters} best chunks (max={max_chunks})")
+        if num_clusters > 1:
+            kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
+            indices = self._find_closest_embeddings(embeddings, num_clusters, kmeans)
+        else:
+            indices = list(range(min(len(documents), max_chunks)))
+        selected = [(documents[i], metadata[i]) for i in indices[:max_chunks]]
+        # Ordina per chunk_index
+        selected = sorted(selected, key=lambda x: x[1].get("chunk_index", 0))
+        return [Document(page_content=d, metadata=m) for d, m in selected]
+    def _select_best_chunks_under_token_budget(self, documents, metadata, embeddings, llm, max_tokens, max_chunks=10):
+        assert len(documents) == len(metadata) == len(embeddings)
+        if max_chunks > len(documents):
+            max_chunks = len(documents)
+        kmeans = KMeans(n_clusters=max_chunks, random_state=42).fit(embeddings)
+        indices = self._find_closest_embeddings(embeddings, max_chunks, kmeans)
+        selected = []
+        total_tokens = 0
+        for i in indices:
+            doc_text = documents[i]
+            token_count = llm.get_num_tokens(doc_text)
+            if total_tokens + token_count > max_tokens:
+                break
+            selected.append((doc_text, metadata[i]))
+            total_tokens += token_count
+        selected = sorted(selected, key=lambda x: x[1].get("chunk_index", 0))
+        return [Document(page_content=d, metadata=m) for d, m in selected]
+    def _get_chunks_for_stuff(self, documents, metadata, embeddings, llm, max_tokens, fallback_max_chunks=5):
+        total_tokens = sum(llm.get_num_tokens(d) for d in documents)
+        if total_tokens <= max_tokens:
+            print(f"✅ Using ALL documents ({total_tokens} tokens)")
+            ordered = sorted(zip(documents, metadata), key=lambda x: x[1].get("chunk_index", 0))
+            return [Document(page_content=d, metadata=m) for d, m in ordered]
+        else:
+            print(f"⚠️ Total tokens {total_tokens} exceed max {max_tokens} — selecting best subset")
+            return self._select_best_chunks_under_token_budget(
+                documents=documents,
+                metadata=metadata,
+                embeddings=embeddings,
+                llm=llm,
+                max_tokens=max_tokens,
+                max_chunks=fallback_max_chunks
+            )

utilities/vectorstore/__pycache__/QdrantLangchainManager.cpython-312.pyc ADDED Viewed

Binary file (16.1 kB). View file

utilities/vectorstore/__pycache__/SummaryManager.cpython-312.pyc ADDED Viewed

Binary file (21.5 kB). View file

utils.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from config import initialize
+from utilities.vectorstore.SummaryManager import SummaryManager
+def manage_collection(file_name, collection_name):
+    if not qdrant_manager.get_collection(collection_name):
+        if not qdrant_manager.create_collection(collection_name):
+            print("❌ Error: Failed to create collection in Qdrant. Exiting application.", flush=True)
+            return
+        success, total_tokens, text = qdrant_manager.insert_document(file_name)
+        if success:
+            print(f"✅ Documento inserito correttamente. Token totali: {total_tokens}")
+            if text:
+                print(f"✅ Testo completo disponibile (entro il limite token): {text[:100]}...")
+        else:
+            print("❌ Errore durante l'inserimento del documento.")
+    #qdrant_manager.delete_collection(collection_name)
+def get_initial_summary(collection_name):
+    """Retrieve initial summary from a Qdrant collection."""
+    # Carica la collection se esiste
+    if not qdrant_manager.get_collection(collection_name):
+        print(f"❌ Collection '{collection_name}' non trovata.")
+        return None
+    # Inizializza il SummaryManager
+    summary_manager = SummaryManager(language="en", qdrant_manager=qdrant_manager)
+    # Genera il riassunto iniziale
+    return summary_manager.do_initial_summary()
+def get_summary(collection_name, type="map_reduce"):
+    """Retrieve initial summary from a Qdrant collection."""
+    # Carica la collection se esiste
+    if not qdrant_manager.get_collection(collection_name):
+        print(f"❌ Collection '{collection_name}' non trovata.")
+        return None
+    # Inizializza il SummaryManager
+    summary_manager = SummaryManager(language="en", qdrant_manager=qdrant_manager)
+    if type == "map_reduce":
+        return summary_manager.do_summary_map_reduce()
+    elif type == "stuff":
+        print("Using stuff method")
+        return summary_manager.do_summary_stuff()
+    else:
+        return None
+def chat_with_bot(llm_manager, contextualize=True):
+    print("🤖 Chatbot! Write 'exit' or 'quit' to close the conversation.\n")
+    try:
+        if contextualize:
+            llm_manager.initialize_conversation()
+            print(f"🤖 ELI: {llm_manager.messages[-1].content}\n")
+    except Exception as e:
+        print(f"⚠️ Could not load initial summary: {e}\n")
+    while True:
+        try:
+            user_input = input("👤 You: ")
+            if user_input.lower() in ["exit", "quit"]:
+                print("👋 End of conversation.")
+                break
+            response = llm_manager.send_message(user_input, contextualize=contextualize)
+            print(f"🤖 ELI: {response}\n")
+        except KeyboardInterrupt:
+            print("\n👋 Conversation stopped.")
+            break
+        except Exception as e:
+            print(f"⚠️ Error: {e}\n")
+llm_manager, qdrant_manager = initialize()
+# file_name="data/txt/Key statisitcs startups.txt"
+collection_name="key_statistics"
+llm_manager, qdrant_manager = initialize()
+if qdrant_manager.get_collection(collection_name):
+    llm_manager.set_qdrant_manager(qdrant_manager)
+chat_with_bot(llm_manager)
+#manage_collection(file_name, collection_name)
+#summary=get_initial_summary(collection_name)
+#summary=get_summary(collection_name,"map_reduce")
+#summary=get_summary(collection_name,type="stuff")
+# if summary:
+#     print(f"✅ Summary:\n{summary}")
+# else:
+#     print("⚠️ Nessun riassunto generato.")