LLM for data filtering
This model is a simple demonstration of how to use an LLM in a Plugboard model. In this case, we're going to use it to filter noisy data. The input.csv contains a sample of temperature data that has been corrupted by various errors. We use the LLM to make corrections to the data where necessary. To run this model you will need to set the OPENAI_API_KEY environment variable.
import os
from getpass import getpass
import pandas as pd
from pydantic import BaseModel
from plugboard.connector import AsyncioConnector
from plugboard.schemas import ConnectorSpec
from plugboard.process import LocalProcess
from plugboard.library import FileReader, FileWriter, LLMChat
if "OPENAI_API_KEY" not in os.environ:
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")
The FileReader and FileWriter components are provided by plugboard: set them up to load the input CSV file and save the model result to output.csv.
input_data = FileReader(name="input_data", path="input.csv", field_names=["temperature"])
output_data = FileWriter(
    name="output_data",
    path="output.csv",
    field_names=["raw_temperature", "corrected_temperature", "was_corrected"],
)
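If you don't have the bundled input.csv to hand, a minimal stand-in like the sketch below will let you try the model. The values here are hypothetical, chosen only to mimic the kinds of corruption the system prompt describes further down.

# Hypothetical stand-in for the bundled input.csv: a slowly varying series
# with a few deliberately corrupted readings (missing or misplaced decimal points).
sample = pd.DataFrame({"temperature": [21.4, 21.5, 215, 21.7, 2.18, 21.9, 21.8, 218]})
sample.to_csv("input.csv", index=False)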
For the noise filter, we need to set up an LLMChat component to correct the temperature readings. To do this we need:

- A Pydantic response model to specify the format we would like the output in;
- A system prompt that provides instructions to the LLM about how we would like the data corrected;
- Configuration on LLMChat to keep context in the chat history, so that the model knows about previous temperature values it has seen.
class CleanTemperature(BaseModel):
    temperature: float
    was_corrected: bool


system_prompt = """
You are going to receive temperature values read from a sensor. These frequently contain errors that need to be corrected.
Example errors are: missing decimal point, missing digit, decimal point in the wrong place, etc.
You need to correct the temperature values and indicate whether they were corrected or not.
For context, the temperature values are in Celsius and are not expected to change by more than 2 degrees between readings.
If you cannot tell what the correct value should be, output the last known correct value.
"""
llm = LLMChat(
    name="llm",
    system_prompt=system_prompt,
    # This needs GPT-4o or similar to work well
    llm_kwargs={"model": "gpt-4o"},
    response_model=CleanTemperature,
    # Expand the response into separate fields: llm.temperature and llm.was_corrected
    expand_response=True,
    # Include context so that the model can use the last known correct value
    context_window=5,
)
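Because response_model is a Pydantic model, each LLM reply is validated into a typed object before its fields are expanded onto the component's outputs. As a rough illustration with made-up values, the validation step behaves like this:

# Illustrative only: a dict shaped like the LLM's structured output.
parsed = CleanTemperature.model_validate({"temperature": 21.5, "was_corrected": True})
print(parsed.temperature, parsed.was_corrected)  # 21.5 True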
Now connect the components together in a LocalProcess.
process = LocalProcess(
    components=[input_data, llm, output_data],
    connectors=[
        # Connect input_data to the LLM
        AsyncioConnector(
            spec=ConnectorSpec(source="input_data.temperature", target="llm.prompt"),
        ),
        # Connect both the raw input and the LLM output to output_data
        AsyncioConnector(
            spec=ConnectorSpec(
                source="input_data.temperature", target="output_data.raw_temperature"
            )
        ),
        AsyncioConnector(
            spec=ConnectorSpec(source="llm.temperature", target="output_data.corrected_temperature")
        ),
        AsyncioConnector(
            spec=ConnectorSpec(source="llm.was_corrected", target="output_data.was_corrected")
        ),
    ],
)
Now we can initialise and run the simulation.
async with process:
    await process.run()
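The bare await syntax above works in a notebook, where an event loop is already running. In a standalone script you would need to start the loop yourself; a minimal sketch, assuming the components above are defined in the same file:

import asyncio

async def main() -> None:
    # Entering the context initialises the process; run() executes the model.
    async with process:
        await process.run()

if __name__ == "__main__":
    asyncio.run(main())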
Now take a look at the data in output.csv and see how the model did.
pd.read_csv("output.csv")
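As a quick check, you could also count how often the LLM intervened, using the was_corrected column configured on the FileWriter above:

df = pd.read_csv("output.csv")
# Fraction of readings the LLM flagged as corrected.
print(df["was_corrected"].mean())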