1. SyntheticDataAttributes

class select_ai.SyntheticDataAttributes(object_name: str | None = None, object_list: List[Mapping] | None = None, owner_name: str | None = None, params: SyntheticDataParams | None = None, record_count: int | None = None, user_prompt: str | None = None)

Attributes to control generation of synthetic data

Parameters:
  • object_name (str) – Table name to populate synthetic data

  • object_list (List[Mapping]) – Use this to generate synthetic data on multiple tables

  • owner_name (str) – Database user who owns the referenced object. Default value is connected user’s schema

  • record_count (int) – Number of records to generate

  • user_prompt (str) – User prompt to guide generation of synthetic data For e.g. “the release date for the movies should be in 2019”

2. SyntheticDataParams

class select_ai.SyntheticDataParams(sample_rows: int | None = None, table_statistics: bool | None = False, priority: str | None = 'HIGH', comments: bool | None = False)

Optional parameters to control generation of synthetic data

Parameters:
  • sample_rows (int) – number of rows from the table to use as a sample to guide the LLM in data generation

  • table_statistics (bool) – Enable or disable the use of table statistics information. Default value is False

  • priority (str) – Assign a priority value that defines the number of parallel requests sent to the LLM for generating synthetic data. Tasks with a higher priority will consume more database resources and complete faster. Possible values are: HIGH, MEDIUM, LOW

  • comments (bool) – Enable or disable sending comments to the LLM to guide data generation. Default value is False

Also, check the generate_synthetic_data PL/SQL API for attribute details

3. Single table synthetic data

The below example shows single table synthetic data generation

3.1. Sync API

import os

import select_ai

user = os.getenv("SELECT_AI_USER")
password = os.getenv("SELECT_AI_PASSWORD")
dsn = os.getenv("SELECT_AI_DB_CONNECT_STRING")

select_ai.connect(user=user, password=password, dsn=dsn)
profile = select_ai.Profile(profile_name="oci_ai_profile")
synthetic_data_params = select_ai.SyntheticDataParams(
    sample_rows=100, table_statistics=True, priority="HIGH"
)
synthetic_data_attributes = select_ai.SyntheticDataAttributes(
    object_name="MOVIE",
    user_prompt="the release date for the movies should be in 2019",
    params=synthetic_data_params,
    record_count=100,
)
profile.generate_synthetic_data(
    synthetic_data_attributes=synthetic_data_attributes
)

output:

SQL> select count(*) from movie;

  COUNT(*)
----------
       100

3.2. Async API

import asyncio
import os

import select_ai

user = os.getenv("SELECT_AI_USER")
password = os.getenv("SELECT_AI_PASSWORD")
dsn = os.getenv("SELECT_AI_DB_CONNECT_STRING")


async def main():
    await select_ai.async_connect(user=user, password=password, dsn=dsn)
    async_profile = await select_ai.AsyncProfile(
        profile_name="async_oci_ai_profile",
    )
    synthetic_data_params = select_ai.SyntheticDataParams(
        sample_rows=100, table_statistics=True, priority="HIGH"
    )
    synthetic_data_attributes = select_ai.SyntheticDataAttributes(
        object_name="MOVIE",
        user_prompt="the release date for the movies should be in 2019",
        params=synthetic_data_params,
        record_count=100,
    )
    await async_profile.generate_synthetic_data(
        synthetic_data_attributes=synthetic_data_attributes
    )


asyncio.run(main())

output:

SQL> select count(*) from movie;

  COUNT(*)
----------
       100

4. Multi table synthetic data

The below example shows multitable synthetic data generation

4.1. Sync API

import os

import select_ai

user = os.getenv("SELECT_AI_USER")
password = os.getenv("SELECT_AI_PASSWORD")
dsn = os.getenv("SELECT_AI_DB_CONNECT_STRING")

select_ai.connect(user=user, password=password, dsn=dsn)
profile = select_ai.Profile(profile_name="oci_ai_profile")
synthetic_data_params = select_ai.SyntheticDataParams(
    sample_rows=100, table_statistics=True, priority="HIGH"
)
object_list = [
    {
        "owner": user,
        "name": "MOVIE",
        "record_count": 100,
        "user_prompt": "the release date for the movies should be in 2019",
    },
    {"owner": user, "name": "ACTOR", "record_count": 10},
    {"owner": user, "name": "DIRECTOR", "record_count": 5},
]
synthetic_data_attributes = select_ai.SyntheticDataAttributes(
    object_list=object_list, params=synthetic_data_params
)
profile.generate_synthetic_data(
    synthetic_data_attributes=synthetic_data_attributes
)

output:

SQL> select count(*) from actor;

  COUNT(*)
----------
    40

SQL> select count(*) from director;

  COUNT(*)
----------
    13

SQL> select count(*) from movie;

  COUNT(*)
----------
       300

4.2. Async API

import asyncio
import os

import select_ai

user = os.getenv("SELECT_AI_USER")
password = os.getenv("SELECT_AI_PASSWORD")
dsn = os.getenv("SELECT_AI_DB_CONNECT_STRING")


async def main():
    await select_ai.async_connect(user=user, password=password, dsn=dsn)
    async_profile = await select_ai.AsyncProfile(
        profile_name="async_oci_ai_profile",
    )
    synthetic_data_params = select_ai.SyntheticDataParams(
        sample_rows=100, table_statistics=True, priority="HIGH"
    )
    object_list = [
        {
            "owner": user,
            "name": "MOVIE",
            "record_count": 100,
            "user_prompt": "the release date for the movies should be in 2019",
        },
        {"owner": user, "name": "ACTOR", "record_count": 10},
        {"owner": user, "name": "DIRECTOR", "record_count": 5},
    ]
    synthetic_data_attributes = select_ai.SyntheticDataAttributes(
        object_list=object_list, params=synthetic_data_params
    )
    await async_profile.generate_synthetic_data(
        synthetic_data_attributes=synthetic_data_attributes
    )


asyncio.run(main())

output:

SQL> select count(*) from actor;

  COUNT(*)
----------
    40

SQL> select count(*) from director;

  COUNT(*)
----------
    13

SQL> select count(*) from movie;

  COUNT(*)
----------
       300