# Extraction and AI
The unified scrape API returns more than raw HTML. Get structured fields, cleaned content, metadata, and LLM-friendly outputs in a single request.
## Output modes
| Output | Best for |
|---|---|
| `html` | Raw parsing and archival |
| `markdown` | LLM and RAG pipelines |
| `text` | NLP and search indexing |
| `clean` | Article-like readable output |
## Article extraction
Extract the main article content with links and metadata:
curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
-H "X-API-Key: YOUR_KEY" \
-H "Content-Type: application/json" \
-d '{"url": "https://toolkitapi.io/blog/post", "formats": ["clean"], "include_links": true}'
import requests
resp = requests.post(
"https://scrape.toolkitapi.io/v1/scrape",
headers={"X-API-Key": "YOUR_KEY"},
json={"url": "https://toolkitapi.io/blog/post", "formats": ["clean"], "include_links": True},
)
data = resp.json()
print(data.get("article", data.get("clean", ""))[:500])
const resp = await fetch("https://scrape.toolkitapi.io/v1/scrape", {
method: "POST",
headers: { "X-API-Key": "YOUR_KEY", "Content-Type": "application/json" },
body: JSON.stringify({ url: "https://toolkitapi.io/blog/post", formats: ["clean"], include_links: true }),
});
const data = await resp.json();
console.log(data.clean);
### Python SDK
from toolkitapi import Scrape
with Scrape(api_key="tk_...") as scrape:
result = scrape.extract_article(
url="https://toolkitapi.io/blog/post",
include_links=True,
)
print(result.get("article"))
## CSS selector extraction
Extract specific elements using CSS selectors:
curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
-H "X-API-Key: YOUR_KEY" \
-H "Content-Type: application/json" \
-d '{
"url": "https://toolkitapi.io/product/123",
"render_js": true,
"extract": {
"selectors": {
"title": "h1",
"price": ".price",
"description": {"selector": ".description", "attr": "text"},
"image_urls": {"selector": ".gallery img", "attr": "src", "multiple": true}
}
}
}'
const resp = await fetch("https://scrape.toolkitapi.io/v1/scrape", {
method: "POST",
headers: { "X-API-Key": "YOUR_KEY", "Content-Type": "application/json" },
body: JSON.stringify({
url: "https://toolkitapi.io/product/123",
render_js: true,
extract: {
selectors: {
title: "h1",
price: ".price",
image_urls: { selector: ".gallery img", attr: "src", multiple: true },
},
},
}),
});
const data = await resp.json();
console.log(data.selectors.title);
Response
{
"selectors": {
"title": "Widget Pro",
"price": "$49.99",
"image_urls": ["https://toolkitapi.io/img/widget-1.jpg", "https://toolkitapi.io/img/widget-2.jpg"]
}
}
## Extract links, images, and metadata
curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
-H "X-API-Key: YOUR_KEY" \
-H "Content-Type: application/json" \
-d '{
"url": "https://toolkitapi.io",
"formats": ["markdown"],
"extract": {
"meta_tags": true,
"link_preview": true,
"links": true,
"images": true
}
}'
from toolkitapi import Scrape
with Scrape(api_key="tk_...") as scrape:
result = scrape.fetch(
url="https://toolkitapi.io",
output="markdown",
extract={"meta_tags": True, "link_preview": True, "links": True, "images": True},
)
print(result.get("meta_tags"))
print(result.get("link_preview"))
print(f"{len(result.get('links', []))} links found")
print(f"{len(result.get('images', []))} images found")
Response
{
"meta_tags": {
"title": "Toolkit API",
"description": "Developer toolbox API — DNS, scraping, image processing, and more.",
"og:title": "Toolkit API",
"og:image": "https://toolkitapi.io/og.png"
},
"link_preview": {
"title": "Toolkit API",
"description": "Developer toolbox API...",
"image": "https://toolkitapi.io/og.png"
},
"links": [
{"url": "https://toolkitapi.io/docs", "text": "Documentation"},
{"url": "https://toolkitapi.io/pricing", "text": "Pricing"}
],
"images": [
{"src": "https://toolkitapi.io/hero.png", "alt": "Toolkit API dashboard"}
]
}
## AI extraction
Extract structured data from any page using natural language and JSON Schema:
curl -X POST "https://scrape.toolkitapi.io/v1/scrape" \
-H "X-API-Key: YOUR_KEY" \
-H "Content-Type: application/json" \
-d '{
"url": "https://toolkitapi.io/product/123",
"render_js": true,
"extract": {
"ai_prompt": "Extract the product name, price, availability, and any sizes/colors.",
"ai_schema": {
"type": "object",
"properties": {
"name": {"type": "string"},
"price": {"type": "string"},
"in_stock": {"type": "boolean"},
"variants": {"type": "array", "items": {"type": "string"}}
}
}
}
}'
from toolkitapi import Scrape
with Scrape(api_key="tk_...") as scrape:
result = scrape.ai_extract(
url="https://toolkitapi.io/product/123",
render_js=True,
prompt="Extract the product name, price, availability, and any sizes/colors.",
schema={
"type": "object",
"properties": {
"name": {"type": "string"},
"price": {"type": "string"},
"in_stock": {"type": "boolean"},
"variants": {"type": "array", "items": {"type": "string"}},
},
},
)
print(result.get("ai_extract"))
Response
{
"ai_extract": {
"name": "Widget Pro",
"price": "$49.99",
"in_stock": true,
"variants": ["Small", "Medium", "Large", "Blue", "Red", "Black"]
}
}
## Extraction parameters
| Parameter | Type | Description |
|---|---|---|
| `extract.selectors` | object | CSS selectors: `{"key": "selector"}` or `{"key": {"selector": "...", "attr": "src", "multiple": true}}` |
| `extract.meta_tags` | boolean | Extract all meta tags and OG/Twitter cards |
| `extract.link_preview` | boolean | Generate a rich link preview object |
| `extract.links` | boolean | Extract all hyperlinks from the page |
| `extract.images` | boolean | Extract all image URLs with alt text |
| `extract.ai_prompt` | string | Natural language description of what to extract |
| `extract.ai_schema` | object | JSON Schema defining the desired output structure |
| `include_links` | boolean | Include hyperlink data in markdown/text output |
| `include_tables` | boolean | Extract tables as structured data |
Tip
AI extraction works best with render_js: true for dynamic pages. The schema defines both what to extract and the return type, so use "type": "boolean" for yes/no fields rather than strings.