Crawler

Manages a Glue Crawler. More information can be found in the AWS Glue Developer Guide.

Example Usage

DynamoDB Target

C#

using Pulumi;
using Aws = Pulumi.Aws;

class MyStack : Stack
{
    public MyStack()
    {
        var example = new Aws.Glue.Crawler("example", new Aws.Glue.CrawlerArgs
        {
            DatabaseName = aws_glue_catalog_database.Example.Name,
            DynamodbTargets = 
            {
                new Aws.Glue.Inputs.CrawlerDynamodbTargetArgs
                {
                    Path = "table-name",
                },
            },
            Role = aws_iam_role.Example.Arn,
        });
    }

}

Go

package main

import (
    "github.com/pulumi/pulumi-aws/sdk/v2/go/aws/glue"
    "github.com/pulumi/pulumi/sdk/v2/go/pulumi"
)

func main() {
    pulumi.Run(func(ctx *pulumi.Context) error {
        _, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
            DatabaseName: pulumi.String(aws_glue_catalog_database.Example.Name),
            DynamodbTargets: glue.CrawlerDynamodbTargetArray{
                &glue.CrawlerDynamodbTargetArgs{
                    Path: pulumi.String("table-name"),
                },
            },
            Role: pulumi.String(aws_iam_role.Example.Arn),
        })
        if err != nil {
            return err
        }
        return nil
    })
}

Python

import pulumi
import pulumi_aws as aws

example = aws.glue.Crawler("example",
    database_name=aws_glue_catalog_database["example"]["name"],
    dynamodb_targets=[{
        "path": "table-name",
    }],
    role=aws_iam_role["example"]["arn"])

TypeScript

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const example = new aws.glue.Crawler("example", {
    databaseName: aws_glue_catalog_database_example.name,
    dynamodbTargets: [{
        path: "table-name",
    }],
    role: aws_iam_role_example.arn,
});

JDBC Target

C#

using Pulumi;
using Aws = Pulumi.Aws;

class MyStack : Stack
{
    public MyStack()
    {
        var example = new Aws.Glue.Crawler("example", new Aws.Glue.CrawlerArgs
        {
            DatabaseName = aws_glue_catalog_database.Example.Name,
            JdbcTargets = 
            {
                new Aws.Glue.Inputs.CrawlerJdbcTargetArgs
                {
                    ConnectionName = aws_glue_connection.Example.Name,
                    Path = "database-name/%",
                },
            },
            Role = aws_iam_role.Example.Arn,
        });
    }

}

Go

package main

import (
    "fmt"

    "github.com/pulumi/pulumi-aws/sdk/v2/go/aws/glue"
    "github.com/pulumi/pulumi/sdk/v2/go/pulumi"
)

func main() {
    pulumi.Run(func(ctx *pulumi.Context) error {
        _, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
            DatabaseName: pulumi.String(aws_glue_catalog_database.Example.Name),
            JdbcTargets: glue.CrawlerJdbcTargetArray{
                &glue.CrawlerJdbcTargetArgs{
                    ConnectionName: pulumi.String(aws_glue_connection.Example.Name),
                    Path:           pulumi.String(fmt.Sprintf("%v%v", "database-name/", "%")),
                },
            },
            Role: pulumi.String(aws_iam_role.Example.Arn),
        })
        if err != nil {
            return err
        }
        return nil
    })
}

Python

import pulumi
import pulumi_aws as aws

example = aws.glue.Crawler("example",
    database_name=aws_glue_catalog_database["example"]["name"],
    jdbc_targets=[{
        "connectionName": aws_glue_connection["example"]["name"],
        "path": "database-name/%",
    }],
    role=aws_iam_role["example"]["arn"])

TypeScript

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const example = new aws.glue.Crawler("example", {
    databaseName: aws_glue_catalog_database_example.name,
    jdbcTargets: [{
        connectionName: aws_glue_connection_example.name,
        path: "database-name/%",
    }],
    role: aws_iam_role_example.arn,
});

S3 Target

C#

using Pulumi;
using Aws = Pulumi.Aws;

class MyStack : Stack
{
    public MyStack()
    {
        var example = new Aws.Glue.Crawler("example", new Aws.Glue.CrawlerArgs
        {
            DatabaseName = aws_glue_catalog_database.Example.Name,
            Role = aws_iam_role.Example.Arn,
            S3Targets = 
            {
                new Aws.Glue.Inputs.CrawlerS3TargetArgs
                {
                    Path = $"s3://{aws_s3_bucket.Example.Bucket}",
                },
            },
        });
    }

}

Go

package main

import (
    "fmt"

    "github.com/pulumi/pulumi-aws/sdk/v2/go/aws/glue"
    "github.com/pulumi/pulumi/sdk/v2/go/pulumi"
)

func main() {
    pulumi.Run(func(ctx *pulumi.Context) error {
        _, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
            DatabaseName: pulumi.String(aws_glue_catalog_database.Example.Name),
            Role:         pulumi.String(aws_iam_role.Example.Arn),
            S3Targets: glue.CrawlerS3TargetArray{
                &glue.CrawlerS3TargetArgs{
                    Path: pulumi.String(fmt.Sprintf("%v%v", "s3://", aws_s3_bucket.Example.Bucket)),
                },
            },
        })
        if err != nil {
            return err
        }
        return nil
    })
}

Python

import pulumi
import pulumi_aws as aws

example = aws.glue.Crawler("example",
    database_name=aws_glue_catalog_database["example"]["name"],
    role=aws_iam_role["example"]["arn"],
    s3_targets=[{
        "path": f"s3://{aws_s3_bucket['example']['bucket']}",
    }])

TypeScript

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const example = new aws.glue.Crawler("example", {
    databaseName: aws_glue_catalog_database_example.name,
    role: aws_iam_role_example.arn,
    s3Targets: [{
        path: pulumi.interpolate`s3://${aws_s3_bucket_example.bucket}`,
    }],
});

Create a Crawler Resource

new Crawler(name: string, args: CrawlerArgs, opts?: CustomResourceOptions);
def Crawler(resource_name, opts=None, catalog_targets=None, classifiers=None, configuration=None, database_name=None, description=None, dynamodb_targets=None, jdbc_targets=None, name=None, role=None, s3_targets=None, schedule=None, schema_change_policy=None, security_configuration=None, table_prefix=None, tags=None, __props__=None)
func NewCrawler(ctx *Context, name string, args CrawlerArgs, opts ...ResourceOption) (*Crawler, error)
public Crawler(string name, CrawlerArgs args, CustomResourceOptions? opts = null)

TypeScript

name string
The unique name of the resource.
args CrawlerArgs
The arguments to resource properties.
opts CustomResourceOptions
Bag of options to control resource's behavior.

Python

resource_name str
The unique name of the resource.
opts ResourceOptions
A bag of options that control this resource's behavior.

Go

ctx Context
Context object for the current deployment.
name string
The unique name of the resource.
args CrawlerArgs
The arguments to resource properties.
opts ResourceOption
Bag of options to control resource's behavior.

C#

name string
The unique name of the resource.
args CrawlerArgs
The arguments to resource properties.
opts CustomResourceOptions
Bag of options to control resource's behavior.

Crawler Resource Properties

To learn more about resource properties and how to use them, see Inputs and Outputs in the Programming Model docs.

Inputs

The Crawler resource accepts the following input properties:

C#

DatabaseName string

Glue database where results are written.

Role string

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

CatalogTargets List<CrawlerCatalogTargetArgs>

List of nested Glue Catalog target arguments. See below.

Classifiers List<string>

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

Configuration string

JSON string of configuration information.

Description string

Description of the crawler.

DynamodbTargets List<CrawlerDynamodbTargetArgs>

List of nested DynamoDB target arguments. See below.

JdbcTargets List<CrawlerJdbcTargetArgs>

List of nested JDBC target arguments. See below.

Name string

Name of the crawler.

S3Targets List<CrawlerS3TargetArgs>

List of nested Amazon S3 target arguments. See below.

Schedule string

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

SchemaChangePolicy CrawlerSchemaChangePolicyArgs

Policy for the crawler’s update and deletion behavior.

SecurityConfiguration string

The name of the Security Configuration to be used by the crawler.

TablePrefix string

The table prefix used for catalog tables that are created.

Tags Dictionary<string, string>

Key-value map of resource tags

Go

DatabaseName string

Glue database where results are written.

Role string

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

CatalogTargets []CrawlerCatalogTarget

List of nested Glue Catalog target arguments. See below.

Classifiers []string

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

Configuration string

JSON string of configuration information.

Description string

Description of the crawler.

DynamodbTargets []CrawlerDynamodbTarget

List of nested DynamoDB target arguments. See below.

JdbcTargets []CrawlerJdbcTarget

List of nested JDBC target arguments. See below.

Name string

Name of the crawler.

S3Targets []CrawlerS3Target

List of nested Amazon S3 target arguments. See below.

Schedule string

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

SchemaChangePolicy CrawlerSchemaChangePolicy

Policy for the crawler’s update and deletion behavior.

SecurityConfiguration string

The name of the Security Configuration to be used by the crawler.

TablePrefix string

The table prefix used for catalog tables that are created.

Tags map[string]string

Key-value map of resource tags

TypeScript

databaseName string

Glue database where results are written.

role string

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

catalogTargets CrawlerCatalogTarget[]

List of nested Glue Catalog target arguments. See below.

classifiers string[]

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

configuration string

JSON string of configuration information.

description string

Description of the crawler.

dynamodbTargets CrawlerDynamodbTarget[]

List of nested DynamoDB target arguments. See below.

jdbcTargets CrawlerJdbcTarget[]

List of nested JDBC target arguments. See below.

name string

Name of the crawler.

s3Targets CrawlerS3Target[]

List of nested Amazon S3 target arguments. See below.

schedule string

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

schemaChangePolicy CrawlerSchemaChangePolicy

Policy for the crawler’s update and deletion behavior.

securityConfiguration string

The name of the Security Configuration to be used by the crawler.

tablePrefix string

The table prefix used for catalog tables that are created.

tags {[key: string]: string}

Key-value map of resource tags

Python

database_name str

Glue database where results are written.

role str

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

catalog_targets List[CrawlerCatalogTarget]

List of nested Glue Catalog target arguments. See below.

classifiers List[str]

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

configuration str

JSON string of configuration information.

description str

Description of the crawler.

dynamodb_targets List[CrawlerDynamodbTarget]

List of nested DynamoDB target arguments. See below.

jdbc_targets List[CrawlerJdbcTarget]

List of nested JDBC target arguments. See below.

name str

Name of the crawler.

s3_targets List[CrawlerS3Target]

List of nested Amazon S3 target arguments. See below.

schedule str

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

schema_change_policy Dict[CrawlerSchemaChangePolicy]

Policy for the crawler’s update and deletion behavior.

security_configuration str

The name of the Security Configuration to be used by the crawler.

table_prefix str

The table prefix used for catalog tables that are created.

tags Dict[str, str]

Key-value map of resource tags
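
To show how several of these inputs combine, here is a minimal TypeScript sketch. The database name, role ARN, bucket path, prefix, and tag values are illustrative placeholders, and the configuration JSON follows the table-grouping option described in the AWS Glue Developer Guide:

import * as aws from "@pulumi/aws";

// All identifiers below are placeholders, not resources from the examples above.
const scheduled = new aws.glue.Crawler("scheduled", {
    databaseName: "my-glue-database",
    role: "arn:aws:iam::123456789012:role/my-glue-role",
    s3Targets: [{
        path: "s3://my-bucket/data",
    }],
    // Run every day at 12:15 UTC.
    schedule: "cron(15 12 * * ? *)",
    // Configuration must be a JSON string; JSON.stringify keeps it well-formed.
    configuration: JSON.stringify({
        Version: 1.0,
        Grouping: {
            TableGroupingPolicy: "CombineCompatibleSchemas",
        },
    }),
    tablePrefix: "raw_",
    tags: {
        Environment: "dev",
    },
});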

Outputs

All input properties are implicitly available as output properties. Additionally, the Crawler resource produces the following output properties:

C#

Arn string

The ARN of the crawler

Id string
The provider-assigned unique ID for this managed resource.

Go

Arn string

The ARN of the crawler

Id string
The provider-assigned unique ID for this managed resource.

TypeScript

arn string

The ARN of the crawler

id string
The provider-assigned unique ID for this managed resource.

Python

arn str

The ARN of the crawler

id str
The provider-assigned unique ID for this managed resource.

Look up an Existing Crawler Resource

Get an existing Crawler resource’s state with the given name, ID, and optional extra properties used to qualify the lookup.

public static get(name: string, id: Input<ID>, state?: CrawlerState, opts?: CustomResourceOptions): Crawler
static get(resource_name, id, opts=None, arn=None, catalog_targets=None, classifiers=None, configuration=None, database_name=None, description=None, dynamodb_targets=None, jdbc_targets=None, name=None, role=None, s3_targets=None, schedule=None, schema_change_policy=None, security_configuration=None, table_prefix=None, tags=None, __props__=None)
func GetCrawler(ctx *Context, name string, id IDInput, state *CrawlerState, opts ...ResourceOption) (*Crawler, error)
public static Crawler Get(string name, Input<string> id, CrawlerState? state, CustomResourceOptions? opts = null)

TypeScript

name
The unique name of the resulting resource.
id
The unique provider ID of the resource to lookup.
state
Any extra arguments used during the lookup.
opts
A bag of options that control this resource's behavior.

Python

resource_name
The unique name of the resulting resource.
id
The unique provider ID of the resource to lookup.

Go

name
The unique name of the resulting resource.
id
The unique provider ID of the resource to lookup.
state
Any extra arguments used during the lookup.
opts
A bag of options that control this resource's behavior.

C#

name
The unique name of the resulting resource.
id
The unique provider ID of the resource to lookup.
state
Any extra arguments used during the lookup.
opts
A bag of options that control this resource's behavior.
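
As a sketch, the TypeScript overload above can bring an existing crawler into a program. "my-crawler" is a placeholder for the provider ID, which for Glue crawlers is the crawler name:

import * as aws from "@pulumi/aws";

// Look up an existing crawler by logical name and provider ID.
const existing = aws.glue.Crawler.get("existing", "my-crawler");

// The returned resource exposes the same outputs as a created one, e.g. its ARN.
export const existingArn = existing.arn;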

The following state arguments are supported:

C#

Arn string

The ARN of the crawler

CatalogTargets List<CrawlerCatalogTargetArgs>

List of nested Glue Catalog target arguments. See below.

Classifiers List<string>

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

Configuration string

JSON string of configuration information.

DatabaseName string

Glue database where results are written.

Description string

Description of the crawler.

DynamodbTargets List<CrawlerDynamodbTargetArgs>

List of nested DynamoDB target arguments. See below.

JdbcTargets List<CrawlerJdbcTargetArgs>

List of nested JDBC target arguments. See below.

Name string

Name of the crawler.

Role string

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

S3Targets List<CrawlerS3TargetArgs>

List of nested Amazon S3 target arguments. See below.

Schedule string

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

SchemaChangePolicy CrawlerSchemaChangePolicyArgs

Policy for the crawler’s update and deletion behavior.

SecurityConfiguration string

The name of the Security Configuration to be used by the crawler.

TablePrefix string

The table prefix used for catalog tables that are created.

Tags Dictionary<string, string>

Key-value map of resource tags

Go

Arn string

The ARN of the crawler

CatalogTargets []CrawlerCatalogTarget

List of nested Glue Catalog target arguments. See below.

Classifiers []string

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

Configuration string

JSON string of configuration information.

DatabaseName string

Glue database where results are written.

Description string

Description of the crawler.

DynamodbTargets []CrawlerDynamodbTarget

List of nested DynamoDB target arguments. See below.

JdbcTargets []CrawlerJdbcTarget

List of nested JDBC target arguments. See below.

Name string

Name of the crawler.

Role string

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

S3Targets []CrawlerS3Target

List of nested Amazon S3 target arguments. See below.

Schedule string

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

SchemaChangePolicy CrawlerSchemaChangePolicy

Policy for the crawler’s update and deletion behavior.

SecurityConfiguration string

The name of the Security Configuration to be used by the crawler.

TablePrefix string

The table prefix used for catalog tables that are created.

Tags map[string]string

Key-value map of resource tags

TypeScript

arn string

The ARN of the crawler

catalogTargets CrawlerCatalogTarget[]

List of nested Glue Catalog target arguments. See below.

classifiers string[]

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

configuration string

JSON string of configuration information.

databaseName string

Glue database where results are written.

description string

Description of the crawler.

dynamodbTargets CrawlerDynamodbTarget[]

List of nested DynamoDB target arguments. See below.

jdbcTargets CrawlerJdbcTarget[]

List of nested JDBC target arguments. See below.

name string

Name of the crawler.

role string

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

s3Targets CrawlerS3Target[]

List of nested Amazon S3 target arguments. See below.

schedule string

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

schemaChangePolicy CrawlerSchemaChangePolicy

Policy for the crawler’s update and deletion behavior.

securityConfiguration string

The name of the Security Configuration to be used by the crawler.

tablePrefix string

The table prefix used for catalog tables that are created.

tags {[key: string]: string}

Key-value map of resource tags

Python

arn str

The ARN of the crawler

catalog_targets List[CrawlerCatalogTarget]

List of nested Glue Catalog target arguments. See below.

classifiers List[str]

List of custom classifiers. By default, all AWS classifiers are included in a crawl, but these custom classifiers always override the default classifiers for a given classification.

configuration str

JSON string of configuration information.

database_name str

Glue database where results are written.

description str

Description of the crawler.

dynamodb_targets List[CrawlerDynamodbTarget]

List of nested DynamoDB target arguments. See below.

jdbc_targets List[CrawlerJdbcTarget]

List of nested JDBC target arguments. See below.

name str

Name of the crawler.

role str

The IAM role friendly name (including path without leading slash), or ARN of an IAM role, used by the crawler to access other resources.

s3_targets List[CrawlerS3Target]

List of nested Amazon S3 target arguments. See below.

schedule str

A cron expression used to specify the schedule. For more information, see Time-Based Schedules for Jobs and Crawlers. For example, to run something every day at 12:15 UTC, you would specify: cron(15 12 * * ? *).

schema_change_policy Dict[CrawlerSchemaChangePolicy]

Policy for the crawler’s update and deletion behavior.

security_configuration str

The name of the Security Configuration to be used by the crawler.

table_prefix str

The table prefix used for catalog tables that are created.

tags Dict[str, str]

Key-value map of resource tags

Supporting Types

CrawlerCatalogTarget

See the input and output API doc for this type.

C#

DatabaseName string

The name of the Glue database to be synchronized.

Tables List<string>

A list of catalog tables to be synchronized.

Go

DatabaseName string

The name of the Glue database to be synchronized.

Tables []string

A list of catalog tables to be synchronized.

TypeScript

databaseName string

The name of the Glue database to be synchronized.

tables string[]

A list of catalog tables to be synchronized.

Python

database_name str

The name of the Glue database to be synchronized.

tables List[str]

A list of catalog tables to be synchronized.
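
A hedged TypeScript sketch of a catalog target follows; the database, table, and role names are placeholders. Note that AWS expects catalog-target crawlers to use the LOG delete behavior, so one is set explicitly:

import * as aws from "@pulumi/aws";

// Placeholder names throughout.
const catalogCrawler = new aws.glue.Crawler("catalogCrawler", {
    databaseName: "my-glue-database",
    role: "arn:aws:iam::123456789012:role/my-glue-role",
    catalogTargets: [{
        databaseName: "my-glue-database",
        tables: ["my_table"],
    }],
    // Catalog targets only support the LOG delete behavior.
    schemaChangePolicy: {
        deleteBehavior: "LOG",
    },
});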

CrawlerDynamodbTarget

See the input and output API doc for this type.

C#

Path string

The name of the DynamoDB table to crawl.

Go

Path string

The name of the DynamoDB table to crawl.

TypeScript

path string

The name of the DynamoDB table to crawl.

Python

path str

The name of the DynamoDB table to crawl.

CrawlerJdbcTarget

See the input and output API doc for this type.

C#

ConnectionName string

The name of the connection to use to connect to the JDBC target.

Path string

The path of the JDBC target.

Exclusions List<string>

A list of glob patterns used to exclude from the crawl.

Go

ConnectionName string

The name of the connection to use to connect to the JDBC target.

Path string

The path of the JDBC target.

Exclusions []string

A list of glob patterns used to exclude from the crawl.

TypeScript

connectionName string

The name of the connection to use to connect to the JDBC target.

path string

The path of the JDBC target.

exclusions string[]

A list of glob patterns used to exclude from the crawl.

Python

connection_name str

The name of the connection to use to connect to the JDBC target.

path str

The path of the JDBC target.

exclusions List[str]

A list of glob patterns used to exclude from the crawl.
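
For illustration, a TypeScript sketch of a JDBC target that crawls every table under a schema while excluding some by glob pattern; the connection, database, and role names are placeholders:

import * as aws from "@pulumi/aws";

const jdbcCrawler = new aws.glue.Crawler("jdbcCrawler", {
    databaseName: "my-glue-database",
    role: "arn:aws:iam::123456789012:role/my-glue-role",
    jdbcTargets: [{
        connectionName: "my-jdbc-connection",
        // The trailing % matches every table under the schema.
        path: "database-name/schema-name/%",
        // Glob patterns for tables to skip during the crawl.
        exclusions: ["database-name/schema-name/tmp_*"],
    }],
});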

CrawlerS3Target

See the input and output API doc for this type.

C#

Path string

The path to the Amazon S3 target.

Exclusions List<string>

A list of glob patterns used to exclude from the crawl.

Go

Path string

The path to the Amazon S3 target.

Exclusions []string

A list of glob patterns used to exclude from the crawl.

TypeScript

path string

The path to the Amazon S3 target.

exclusions string[]

A list of glob patterns used to exclude from the crawl.

Python

path str

The path to the Amazon S3 target.

exclusions List[str]

A list of glob patterns used to exclude from the crawl.
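
A minimal TypeScript sketch of an S3 target with exclusions; the bucket name, role ARN, and glob patterns are placeholders:

import * as aws from "@pulumi/aws";

const s3Crawler = new aws.glue.Crawler("s3Crawler", {
    databaseName: "my-glue-database",
    role: "arn:aws:iam::123456789012:role/my-glue-role",
    s3Targets: [{
        path: "s3://my-bucket/data",
        // Skip temporary output and metadata files anywhere under the path.
        exclusions: ["**/_temporary/**", "**/*.metadata"],
    }],
});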

CrawlerSchemaChangePolicy

See the input and output API doc for this type.

C#

DeleteBehavior string

The deletion behavior when the crawler finds a deleted object. Valid values: LOG, DELETE_FROM_DATABASE, or DEPRECATE_IN_DATABASE. Defaults to DEPRECATE_IN_DATABASE.

UpdateBehavior string

The update behavior when the crawler finds a changed schema. Valid values: LOG or UPDATE_IN_DATABASE. Defaults to UPDATE_IN_DATABASE.

Go

DeleteBehavior string

The deletion behavior when the crawler finds a deleted object. Valid values: LOG, DELETE_FROM_DATABASE, or DEPRECATE_IN_DATABASE. Defaults to DEPRECATE_IN_DATABASE.

UpdateBehavior string

The update behavior when the crawler finds a changed schema. Valid values: LOG or UPDATE_IN_DATABASE. Defaults to UPDATE_IN_DATABASE.

TypeScript

deleteBehavior string

The deletion behavior when the crawler finds a deleted object. Valid values: LOG, DELETE_FROM_DATABASE, or DEPRECATE_IN_DATABASE. Defaults to DEPRECATE_IN_DATABASE.

updateBehavior string

The update behavior when the crawler finds a changed schema. Valid values: LOG or UPDATE_IN_DATABASE. Defaults to UPDATE_IN_DATABASE.

Python

delete_behavior str

The deletion behavior when the crawler finds a deleted object. Valid values: LOG, DELETE_FROM_DATABASE, or DEPRECATE_IN_DATABASE. Defaults to DEPRECATE_IN_DATABASE.

update_behavior str

The update behavior when the crawler finds a changed schema. Valid values: LOG or UPDATE_IN_DATABASE. Defaults to UPDATE_IN_DATABASE.
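
As a sketch, a non-default schema change policy in TypeScript; the resource names, bucket, and role ARN are placeholders:

import * as aws from "@pulumi/aws";

const loggingCrawler = new aws.glue.Crawler("loggingCrawler", {
    databaseName: "my-glue-database",
    role: "arn:aws:iam::123456789012:role/my-glue-role",
    s3Targets: [{
        path: "s3://my-bucket/data",
    }],
    schemaChangePolicy: {
        // Log deleted objects instead of deprecating their catalog entries.
        deleteBehavior: "LOG",
        // Keep the default of updating table definitions in place.
        updateBehavior: "UPDATE_IN_DATABASE",
    },
});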

Package Details

Repository
https://github.com/pulumi/pulumi-aws
License
Apache-2.0
Notes
This Pulumi package is based on the aws Terraform Provider.