Extraction and AI

The unified scrape API returns more than raw HTML. Get structured fields, cleaned content, metadata, and LLM-friendly outputs in a single request.

Output modes

| Output | Best for |
| --- | --- |
| `html` | Raw parsing and archival |
| `markdown` | LLM and RAG pipelines |
| `text` | NLP and search indexing |
| `clean` | Article-like readable output |

Article extraction

Extract the main article content with links and metadata:

curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
  -H "X-API-Key: YOUR_KEY" \
  -H "Content-Type: application/json" \
  -d '{"url": "https://toolkitapi.io/blog/post", "formats": ["clean"], "include_links": true}'
import requests

resp = requests.post(
    "https://scrape.toolkitapi.io/v1/scrape",
    headers={"X-API-Key": "YOUR_KEY"},
    json={"url": "https://toolkitapi.io/blog/post", "formats": ["clean"], "include_links": True},
)
data = resp.json()
print(data.get("article", data.get("clean", ""))[:500])
const resp = await fetch("https://scrape.toolkitapi.io/v1/scrape", {
  method: "POST",
  headers: { "X-API-Key": "YOUR_KEY", "Content-Type": "application/json" },
  body: JSON.stringify({ url: "https://toolkitapi.io/blog/post", formats: ["clean"], include_links: true }),
});
const data = await resp.json();
console.log(data.clean);

Python SDK

from toolkitapi import Scrape

with Scrape(api_key="tk_...") as scrape:
    result = scrape.extract_article(
        url="https://toolkitapi.io/blog/post",
        include_links=True,
    )
    print(result.get("article"))

CSS selector extraction

Extract specific elements using CSS selectors:

curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
  -H "X-API-Key: YOUR_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://toolkitapi.io/product/123",
    "render_js": true,
    "extract": {
      "selectors": {
        "title": "h1",
        "price": ".price",
        "description": {"selector": ".description", "attr": "text"},
        "image_urls": {"selector": ".gallery img", "attr": "src", "multiple": true}
      }
    }
  }'
const resp = await fetch("https://scrape.toolkitapi.io/v1/scrape", {
  method: "POST",
  headers: { "X-API-Key": "YOUR_KEY", "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://toolkitapi.io/product/123",
    render_js: true,
    extract: {
      selectors: {
        title: "h1",
        price: ".price",
        image_urls: { selector: ".gallery img", attr: "src", multiple: true },
      },
    },
  }),
});
const data = await resp.json();
console.log(data.selectors.title);
Response
{
  "selectors": {
    "title": "Widget Pro",
    "price": "$49.99",
    "image_urls": ["https://toolkitapi.io/img/widget-1.jpg", "https://toolkitapi.io/img/widget-2.jpg"]
  }
}
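
Metadata, links, and images

Extract meta tags, a link preview, hyperlinks, and images alongside the page content:

curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \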
  -H "X-API-Key: YOUR_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://toolkitapi.io",
    "formats": ["markdown"],
    "extract": {
      "meta_tags": true,
      "link_preview": true,
      "links": true,
      "images": true
    }
  }'
from toolkitapi import Scrape

with Scrape(api_key="tk_...") as scrape:
    result = scrape.fetch(
        url="https://toolkitapi.io",
        output="markdown",
        extract={"meta_tags": True, "link_preview": True, "links": True, "images": True},
    )
    print(result.get("meta_tags"))
    print(result.get("link_preview"))
    print(f"{len(result.get('links', []))} links found")
    print(f"{len(result.get('images', []))} images found")
Response
{
  "meta_tags": {
    "title": "Toolkit API",
    "description": "Developer toolbox API — DNS, scraping, image processing, and more.",
    "og:title": "Toolkit API",
    "og:image": "https://toolkitapi.io/og.png"
  },
  "link_preview": {
    "title": "Toolkit API",
    "description": "Developer toolbox API...",
    "image": "https://toolkitapi.io/og.png"
  },
  "links": [
    {"url": "https://toolkitapi.io/docs", "text": "Documentation"},
    {"url": "https://toolkitapi.io/pricing", "text": "Pricing"}
  ],
  "images": [
    {"src": "https://toolkitapi.io/hero.png", "alt": "Toolkit API dashboard"}
  ]
}

AI extraction

Extract structured data from any page by describing what you want in a natural-language prompt (ai_prompt) and defining the shape of the result with a JSON Schema (ai_schema):

curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
  -H "X-API-Key: YOUR_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://toolkitapi.io/product/123",
    "render_js": true,
    "extract": {
      "ai_prompt": "Extract the product name, price, availability, and any sizes/colors.",
      "ai_schema": {
        "type": "object",
        "properties": {
          "name": {"type": "string"},
          "price": {"type": "string"},
          "in_stock": {"type": "boolean"},
          "variants": {"type": "array", "items": {"type": "string"}}
        }
      }
    }
  }'
from toolkitapi import Scrape

with Scrape(api_key="tk_...") as scrape:
    result = scrape.ai_extract(
        url="https://toolkitapi.io/product/123",
        render_js=True,
        prompt="Extract the product name, price, availability, and any sizes/colors.",
        schema={
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "price": {"type": "string"},
                "in_stock": {"type": "boolean"},
                "variants": {"type": "array", "items": {"type": "string"}},
            },
        },
    )
    print(result.get("ai_extract"))
Response
{
  "ai_extract": {
    "name": "Widget Pro",
    "price": "$49.99",
    "in_stock": true,
    "variants": ["Small", "Medium", "Large", "Blue", "Red", "Black"]
  }
}

Extraction parameters

| Parameter | Type | Description |
| --- | --- | --- |
| `extract.selectors` | object | CSS selectors: `{"key": "selector"}` or `{"key": {"selector": "...", "attr": "src", "multiple": true}}` |
| `extract.meta_tags` | boolean | Extract all meta tags and OG/Twitter cards |
| `extract.link_preview` | boolean | Generate a rich link preview object |
| `extract.links` | boolean | Extract all hyperlinks from the page |
| `extract.images` | boolean | Extract all image URLs with alt text |
| `extract.ai_prompt` | string | Natural language description of what to extract |
| `extract.ai_schema` | object | JSON Schema defining the desired output structure |
| `include_links` | boolean | Include hyperlink data in markdown/text output |
| `include_tables` | boolean | Extract tables as structured data |

Tip

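AI extraction works best with `"render_js": true` on dynamic pages, so that content rendered by JavaScript is present before extraction runs. The schema defines both what to extract and the type of each returned field, so declare yes/no fields as `"type": "boolean"` rather than as strings.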