Extract and Download Images
Find all images on a page and download them to disk using a headless browser that can access JavaScript-rendered content.
- Prerequisite: a Browserless API token from your account dashboard
Steps
- REST API
- Frameworks
Use the /scrape REST endpoint to extract image URLs without opening a browser connection.
- cURL
- JavaScript
- Python
- Java
- C#
- Go
- PHP
- Ruby
1. Build the request
Use the /scrape endpoint to extract all <img> src attributes from the page:
https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE
2. Extract URLs and download each image
The src is nested inside each result's attributes array. Use jq to pull it out, then loop and download:
# Ask /scrape for every <img> element and keep only absolute http(s) src values.
# The `?` operators make jq print nothing (instead of failing) when the response
# object carries no data/results.
URLS=$(curl -s -X POST \
  "https://production-sfo.browserless.io/scrape?token=YOUR_API_TOKEN_HERE" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example.com",
    "elements": [{ "selector": "img", "timeout": 5000 }]
  }' | jq -r '.data[0].results[]? | .attributes[]? | select(.name=="src") | .value | select(startswith("http"))')

mkdir -p images
i=0
while IFS= read -r url; do
  # A here-string always feeds at least one (possibly empty) line; skip it so an
  # empty result set does not trigger a bogus download.
  [ -n "$url" ] || continue

  # Derive the file extension from the URL path only.
  clean="${url%%[?#]*}"   # drop query string and fragment
  rest="${clean#*://}"    # drop the scheme, leaving host/path
  case "$rest" in
    */*) name="${rest##*/}" ;;  # last path segment
    *)   name="" ;;             # bare host, no path
  esac
  case "$name" in
    *.*) ext=".${name##*.}" ;;
    *)   ext=".jpg" ;;          # extensionless paths fall back to .jpg
  esac

  # -f makes curl fail on HTTP errors instead of saving the error page as an image.
  if curl -fsSL "$url" --output "images/image-$i$ext"; then
    echo "Saved images/image-$i$ext"
    i=$((i + 1))
  else
    echo "Failed to download $url" >&2
  fi
done <<< "$URLS"
1. Scrape image URLs and download each one
import fs from 'fs';
import path from 'path';
const TOKEN = 'YOUR_API_TOKEN_HERE';
const PAGE_URL = 'https://en.wikipedia.org/wiki/Main_Page';

// Ask the /scrape endpoint for every <img> element on the page.
const scrapeRes = await fetch(
  `https://production-sfo.browserless.io/scrape?token=${TOKEN}`,
  {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      url: PAGE_URL,
      elements: [{ selector: 'img' }],
    }),
  }
);
if (!scrapeRes.ok) {
  throw new Error(`Scrape request failed: ${scrapeRes.status} ${await scrapeRes.text()}`);
}
const { data } = await scrapeRes.json();

// src is nested inside each result's attributes array. The value may be relative
// or protocol-relative (e.g. "//host/a.png"), so resolve it against the page URL
// instead of discarding it, then keep only http(s) URLs (drops data: URIs).
const imageUrls = (data?.[0]?.results ?? [])
  .flatMap((r) => r.attributes ?? [])
  .filter((a) => a.name === 'src' && a.value)
  .flatMap((a) => {
    try {
      return [new URL(a.value, PAGE_URL).href];
    } catch {
      return []; // Skip values that cannot be parsed as a URL.
    }
  })
  .filter((url) => url.startsWith('http'));

console.log(`Found ${imageUrls.length} images`);
fs.mkdirSync('images', { recursive: true });

// Number files by successful downloads so skipped URLs leave no gaps.
let saved = 0;
for (const url of imageUrls) {
  let res;
  try {
    res = await fetch(url);
  } catch (err) {
    console.warn(`Skipping ${url}: ${err.message}`);
    continue;
  }
  if (!res.ok) {
    // Don't write an HTML error page to disk under an image name.
    console.warn(`Skipping ${url}: HTTP ${res.status}`);
    continue;
  }
  const buf = Buffer.from(await res.arrayBuffer());
  // Use the URL path's extension; fall back to .jpg for extensionless paths.
  const ext = path.extname(new URL(url).pathname) || '.jpg';
  fs.writeFileSync(`images/image-${saved}${ext}`, buf);
  console.log(`Saved image-${saved}${ext}`);
  saved += 1;
}
2. Check the output
Run with node images.mjs (Node.js 18 or later, which provides the global fetch API). All images are saved into an images/ directory.
1. Install dependencies
pip install requests
2. Scrape image URLs and download each one
import os
import requests
from urllib.parse import urlparse
from urllib.parse import urljoin

TOKEN = 'YOUR_API_TOKEN_HERE'
PAGE_URL = 'https://en.wikipedia.org/wiki/Main_Page'

# Ask the /scrape endpoint for every <img> element on the page.
scrape_res = requests.post(
    f'https://production-sfo.browserless.io/scrape?token={TOKEN}',
    json={
        'url': PAGE_URL,
        'elements': [{'selector': 'img'}],
    },
    timeout=60,  # requests waits forever without an explicit timeout
)
scrape_res.raise_for_status()
data = scrape_res.json().get('data') or []
results = data[0].get('results', []) if data else []

# src is nested inside each result's attributes array. The value may be relative
# or protocol-relative (e.g. "//host/a.png"), so resolve it against the page URL
# instead of discarding it, then keep only http(s) URLs (drops data: URIs).
image_urls = []
for result in results:
    for attr in result.get('attributes', []):
        if attr.get('name') != 'src' or not attr.get('value'):
            continue
        url = urljoin(PAGE_URL, attr['value'])
        if url.startswith('http'):
            image_urls.append(url)

print(f'Found {len(image_urls)} images')
os.makedirs('images', exist_ok=True)

# Number files by successful downloads so skipped URLs leave no gaps.
saved = 0
for url in image_urls:
    try:
        res = requests.get(url, timeout=30)
        # Don't write an HTML error page to disk under an image name.
        res.raise_for_status()
    except requests.RequestException as err:
        print(f'Skipping {url}: {err}')
        continue
    # Use the URL path's extension; fall back to .jpg for extensionless paths.
    ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
    filename = f'images/image-{saved}{ext}'
    with open(filename, 'wb') as f:
        f.write(res.content)
    print(f'Saved {filename}')
    saved += 1
3. Check the output
Run with python images.py. All images are saved into an images/ directory.
1. Install dependencies
Add Gson to your pom.xml for JSON parsing (java.net.http.HttpClient ships with the JDK):
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
2. Scrape image URLs and download each one
import com.google.gson.*;
import java.net.URI;
import java.net.http.*;
import java.nio.file.*;
/**
 * Scrapes every {@code <img>} element from a page via the Browserless /scrape
 * endpoint and downloads each absolute http(s) {@code src} into {@code images/}.
 */
public class DownloadImages {
    /**
     * Entry point: posts the scrape request, then downloads each image URL found.
     *
     * @param args unused
     * @throws Exception if the scrape request fails or a file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String token = "YOUR_API_TOKEN_HERE";
        // The default client never follows redirects; image URLs frequently redirect.
        HttpClient client = HttpClient.newBuilder()
            .followRedirects(HttpClient.Redirect.NORMAL)
            .build();

        String body = "{\"url\":\"https://example.com\",\"elements\":[{\"selector\":\"img\",\"timeout\":5000}]}";
        HttpRequest scrapeReq = HttpRequest.newBuilder()
            .uri(URI.create("https://production-sfo.browserless.io/scrape?token=" + token))
            .header("Content-Type", "application/json")
            .POST(HttpRequest.BodyPublishers.ofString(body))
            .build();
        HttpResponse<String> scrapeRes = client.send(scrapeReq, HttpResponse.BodyHandlers.ofString());
        if (scrapeRes.statusCode() != 200) {
            throw new IllegalStateException(
                "Scrape request failed with HTTP " + scrapeRes.statusCode() + ": " + scrapeRes.body());
        }

        JsonObject json = JsonParser.parseString(scrapeRes.body()).getAsJsonObject();
        JsonArray data = json.getAsJsonArray("data");
        if (data == null || data.size() == 0) {
            System.out.println("No results returned");
            return;
        }
        JsonArray results = data.get(0).getAsJsonObject().getAsJsonArray("results");

        Files.createDirectories(Path.of("images"));
        int i = 0;
        for (JsonElement resultEl : results) {
            // src is nested inside each result's attributes array.
            for (JsonElement attrEl : resultEl.getAsJsonObject().getAsJsonArray("attributes")) {
                JsonObject attr = attrEl.getAsJsonObject();
                String name = attr.get("name").getAsString();
                String value = attr.get("value").getAsString();
                if (!"src".equals(name) || !value.startsWith("http")) {
                    continue;
                }

                URI imageUri;
                HttpRequest imgReq;
                try {
                    // Both calls reject malformed URLs; skip those instead of aborting the run.
                    imageUri = URI.create(value);
                    imgReq = HttpRequest.newBuilder().uri(imageUri).build();
                } catch (IllegalArgumentException e) {
                    System.err.println("Skipping invalid URL: " + value);
                    continue;
                }

                HttpResponse<byte[]> imgRes = client.send(imgReq, HttpResponse.BodyHandlers.ofByteArray());
                if (imgRes.statusCode() != 200) {
                    // Don't write an error page to disk under an image name.
                    System.err.println("Skipping " + value + ": HTTP " + imgRes.statusCode());
                    continue;
                }

                String ext = extensionOf(imageUri);
                Files.write(Path.of("images/image-" + i + ext), imgRes.body());
                System.out.println("Saved images/image-" + i + ext);
                i++;
            }
        }
    }

    /**
     * Returns the file extension (including the dot) of the last path segment of
     * {@code uri}, or {@code ".jpg"} when that segment has none. Only a dot after the
     * final slash counts, so a path like {@code /a.b/c} yields {@code ".jpg"} rather
     * than {@code ".b/c"}. {@link URI#getPath()} already excludes the query string.
     */
    private static String extensionOf(URI uri) {
        String path = uri.getPath();
        if (path == null) {
            return ".jpg";
        }
        int slash = path.lastIndexOf('/');
        int dot = path.lastIndexOf('.');
        return dot > slash ? path.substring(dot) : ".jpg";
    }
}
3. Check the output
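Compile with mvn compile and run with mvn exec:java -Dexec.mainClass=DownloadImages (plain javac and java also work if the Gson JAR is on the classpath). All images are saved into an images/ directory.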
1. Dependencies
System.Net.Http.HttpClient and System.Text.Json ship with .NET as part of its base class library. No packages needed.
2. Scrape image URLs and download each one
using System;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
/// <summary>
/// Scrapes every &lt;img&gt; element from a page via the Browserless /scrape endpoint
/// and downloads each absolute http(s) src into the images/ directory.
/// </summary>
class DownloadImages
{
    static async Task Main()
    {
        const string token = "YOUR_API_TOKEN_HERE";
        using var client = new HttpClient();

        var body = new StringContent(
            "{\"url\":\"https://example.com\",\"elements\":[{\"selector\":\"img\",\"timeout\":5000}]}",
            Encoding.UTF8,
            "application/json"
        );
        var scrapeRes = await client.PostAsync(
            $"https://production-sfo.browserless.io/scrape?token={token}",
            body
        );
        // Fail fast with a clear HTTP error rather than a JSON parsing error later.
        scrapeRes.EnsureSuccessStatusCode();

        using var json = JsonDocument.Parse(await scrapeRes.Content.ReadAsStringAsync());
        if (!json.RootElement.TryGetProperty("data", out var data) || data.GetArrayLength() == 0)
        {
            Console.WriteLine("No results returned");
            return;
        }
        var results = data[0].GetProperty("results");

        Directory.CreateDirectory("images");
        int i = 0;
        foreach (var result in results.EnumerateArray())
        {
            // src is nested inside each result's attributes array.
            foreach (var attr in result.GetProperty("attributes").EnumerateArray())
            {
                var name = attr.GetProperty("name").GetString();
                var value = attr.GetProperty("value").GetString();
                // Ordinal comparison: URL schemes must not be matched with culture rules.
                if (name != "src" || value is null || !value.StartsWith("http", StringComparison.Ordinal))
                {
                    continue;
                }

                byte[] bytes;
                try
                {
                    bytes = await client.GetByteArrayAsync(value);
                }
                catch (HttpRequestException ex)
                {
                    // GetByteArrayAsync throws on non-success status codes; skip this
                    // image instead of aborting the remaining downloads.
                    Console.Error.WriteLine($"Skipping {value}: {ex.Message}");
                    continue;
                }

                // Use the URL's file extension; fall back to .jpg for extensionless paths.
                string ext = Path.GetExtension(new Uri(value).AbsolutePath) is { Length: > 0 } e ? e : ".jpg";
                string filename = $"images/image-{i}{ext}";
                await File.WriteAllBytesAsync(filename, bytes);
                Console.WriteLine($"Saved {filename}");
                i++;
            }
        }
    }
}
3. Check the output
Run with dotnet run. All images are saved into an images/ directory.
1. Dependencies
encoding/json and net/http are part of Go's standard library. No extra packages needed.
2. Scrape image URLs and download each one
package main
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
)
type ScrapeResponse struct {
Data []struct {
Results []struct {
Attributes []struct {
Name string `json:"name"`
Value string `json:"value"`
} `json:"attributes"`
} `json:"results"`
} `json:"data"`
}
// main posts a /scrape request for all <img> elements, then downloads every
// absolute http(s) src attribute into the images/ directory.
func main() {
	token := "YOUR_API_TOKEN_HERE"
	body := `{"url":"https://example.com","elements":[{"selector":"img","timeout":5000}]}`

	req, err := http.NewRequest("POST",
		"https://production-sfo.browserless.io/scrape?token="+token,
		bytes.NewBufferString(body),
	)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		msg, _ := io.ReadAll(resp.Body)
		panic(fmt.Sprintf("scrape request failed: %s: %s", resp.Status, msg))
	}

	var scrape ScrapeResponse
	if err := json.NewDecoder(resp.Body).Decode(&scrape); err != nil {
		panic(err)
	}
	// Indexing Data[0] on an empty response would panic with an opaque message.
	if len(scrape.Data) == 0 {
		fmt.Println("No results returned")
		return
	}

	if err := os.MkdirAll("images", 0755); err != nil {
		panic(err)
	}

	i := 0
	for _, result := range scrape.Data[0].Results {
		for _, attr := range result.Attributes {
			if attr.Name != "src" || !strings.HasPrefix(attr.Value, "http") {
				continue
			}

			imgResp, err := http.Get(attr.Value)
			if err != nil {
				fmt.Fprintln(os.Stderr, "Skipping", attr.Value+":", err)
				continue
			}
			data, err := io.ReadAll(imgResp.Body)
			imgResp.Body.Close()
			if err != nil {
				fmt.Fprintln(os.Stderr, "Skipping", attr.Value+":", err)
				continue
			}
			if imgResp.StatusCode != http.StatusOK {
				// Don't write an error page to disk under an image name.
				fmt.Fprintln(os.Stderr, "Skipping", attr.Value+":", imgResp.Status)
				continue
			}

			// Take the extension from the parsed path of the request that produced
			// this response: the query string is excluded, and a bare host such as
			// "https://example.com" no longer yields ".com".
			ext := filepath.Ext(imgResp.Request.URL.Path)
			if ext == "" {
				ext = ".jpg"
			}
			filename := fmt.Sprintf("images/image-%d%s", i, ext)
			if err := os.WriteFile(filename, data, 0644); err != nil {
				fmt.Fprintln(os.Stderr, "Could not write", filename+":", err)
				continue
			}
			fmt.Println("Saved", filename)
			i++
		}
	}
}
3. Check the output
Run with go run main.go. All images are saved into an images/ directory.
1. Dependencies
This example uses PHP's curl extension and the built-in file_get_contents function, and requires PHP 8.0 or later for str_starts_with. No Composer packages needed.
2. Scrape image URLs and download each one
<?php
// Scrape every <img> element through the Browserless /scrape endpoint and
// download each absolute http(s) src into the images/ directory.
$token = 'YOUR_API_TOKEN_HERE';

$ch = curl_init('https://production-sfo.browserless.io/scrape?token=' . $token);
curl_setopt_array($ch, [
    CURLOPT_POST => true,
    CURLOPT_HTTPHEADER => ['Content-Type: application/json'],
    CURLOPT_POSTFIELDS => json_encode([
        'url' => 'https://example.com',
        'elements' => [['selector' => 'img', 'timeout' => 5000]],
    ]),
    CURLOPT_RETURNTRANSFER => true,
]);
$body = curl_exec($ch);
if ($body === false) {
    // curl_exec returns false on transport errors; report the reason and stop.
    fwrite(STDERR, 'Scrape request failed: ' . curl_error($ch) . "\n");
    curl_close($ch);
    exit(1);
}
curl_close($ch);

$data = json_decode($body, true);
// Missing keys (e.g. an error response) become an empty list instead of warnings.
$results = $data['data'][0]['results'] ?? [];

// src is nested inside each result's attributes array.
$imageUrls = [];
foreach ($results as $result) {
    foreach ($result['attributes'] ?? [] as $attr) {
        if (($attr['name'] ?? null) === 'src' && str_starts_with($attr['value'] ?? '', 'http')) {
            $imageUrls[] = $attr['value'];
        }
    }
}
echo 'Found ' . count($imageUrls) . " images\n";

// mkdir() emits a warning when the directory already exists, e.g. on a second run.
if (!is_dir('images') && !mkdir('images', 0755, true)) {
    fwrite(STDERR, "Could not create images directory\n");
    exit(1);
}

// Number files by successful downloads so skipped URLs leave no gaps.
$saved = 0;
foreach ($imageUrls as $url) {
    $imageData = file_get_contents($url);
    if ($imageData === false) {
        // A failed download would otherwise be written out as an empty file.
        fwrite(STDERR, "Skipping {$url}: download failed\n");
        continue;
    }
    // Use the URL path's extension; fall back to jpg for extensionless paths.
    $ext = pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION) ?: 'jpg';
    $filename = "images/image-{$saved}.{$ext}";
    file_put_contents($filename, $imageData);
    echo "Saved {$filename}\n";
    $saved++;
}
3. Check the output
Run with php images.php. All images are saved into an images/ directory.
1. Dependencies
net/http, json, and fileutils are part of Ruby's standard library. No gems required.
2. Scrape image URLs and download each one
require 'net/http'
require 'json'
require 'uri'
require 'fileutils'
TOKEN = 'YOUR_API_TOKEN_HERE'

# Build the POST request for the /scrape endpoint.
endpoint = URI("https://production-sfo.browserless.io/scrape?token=#{TOKEN}")
payload = {
  url: 'https://example.com',
  elements: [{ selector: 'img', timeout: 5000 }]
}
post = Net::HTTP::Post.new(endpoint, 'Content-Type' => 'application/json')
post.body = JSON.generate(payload)

# Send it over TLS and parse the JSON reply.
connection = Net::HTTP.new(endpoint.host, endpoint.port)
connection.use_ssl = true
parsed = JSON.parse(connection.request(post).body)

# Each result carries an attributes array; collect the http(s) src values from it.
image_urls = []
parsed['data'][0]['results'].each do |result|
  result['attributes'].each do |attribute|
    next unless attribute['name'] == 'src' && attribute['value'].start_with?('http')

    image_urls << attribute['value']
  end
end

puts "Found #{image_urls.size} images"

FileUtils.mkdir_p('images')

# Download every URL, naming files by position and URL extension (.jpg if none).
image_urls.each.with_index do |source, index|
  bytes = Net::HTTP.get(URI(source))
  suffix = File.extname(URI(source).path)
  filename = "images/image-#{index}#{suffix.empty? ? '.jpg' : suffix}"
  File.binwrite(filename, bytes)
  puts "Saved #{filename}"
end
3. Check the output
Run with ruby images.rb. All images are saved into an images/ directory.
Use a browser connection to evaluate the fully rendered DOM directly. This is useful when images are loaded lazily or injected by JavaScript.
- Puppeteer
- Playwright
- Go (chromedp)
1. Install dependencies
npm install puppeteer-core
2. Connect, extract, and download
import fs from 'fs';
import path from 'path';
import puppeteer from 'puppeteer-core';
// Runs inside the page: gather the resolved src of every <img> that is http(s).
const collectImageSources = () => {
  const sources = [];
  for (const img of document.querySelectorAll('img')) {
    if (img.src.startsWith('http')) {
      sources.push(img.src);
    }
  }
  return sources;
};

// Attach to the remote browser over WebSocket.
const browser = await puppeteer.connect({
  browserWSEndpoint: 'wss://production-sfo.browserless.io?token=YOUR_API_TOKEN_HERE',
});

try {
  const page = await browser.newPage();
  await page.goto('https://example.com', { waitUntil: 'networkidle2' });

  const sources = await page.evaluate(collectImageSources);
  console.log(`Found ${sources.length} images`);

  fs.mkdirSync('images', { recursive: true });

  let index = 0;
  for (const source of sources) {
    const response = await fetch(source);
    const bytes = Buffer.from(await response.arrayBuffer());
    // Keep the URL path's extension, defaulting to .jpg when there is none.
    const suffix = path.extname(new URL(source).pathname) || '.jpg';
    const name = `image-${index}${suffix}`;
    fs.writeFileSync(`images/${name}`, bytes);
    console.log(`Saved ${name}`);
    index += 1;
  }
} finally {
  // Close in every case so the remote session is released even after an error.
  await browser.close();
}
3. Check the output
Run with node images.mjs. All images are saved into an images/ directory.
- JavaScript
- Python
- Java
- C#
1. Install dependencies
npm install playwright-core
2. Connect, extract, and download
import fs from 'fs';
import path from 'path';
import { chromium } from 'playwright-core';
// Download one image and store it as images/image-<index><ext>.
async function saveImage(source, index) {
  const response = await fetch(source);
  const bytes = Buffer.from(await response.arrayBuffer());
  // Keep the URL path's extension, defaulting to .jpg when there is none.
  const suffix = path.extname(new URL(source).pathname) || '.jpg';
  const name = `image-${index}${suffix}`;
  fs.writeFileSync(`images/${name}`, bytes);
  console.log(`Saved ${name}`);
}

const browser = await chromium.connectOverCDP(
  'wss://production-sfo.browserless.io?token=YOUR_API_TOKEN_HERE'
);

try {
  // Reuse the existing default context: browser.newPage() would create a fresh
  // context that doesn't inherit proxy, profile, or launch settings.
  const [defaultContext] = browser.contexts();
  const page = await defaultContext.newPage();
  await page.goto('https://example.com', { waitUntil: 'networkidle' });

  // Evaluated in the page: resolved src of every <img>, limited to http(s).
  const sources = await page.evaluate(() => {
    const images = Array.from(document.querySelectorAll('img'));
    return images.map((img) => img.src).filter((src) => src.startsWith('http'));
  });
  console.log(`Found ${sources.length} images`);

  fs.mkdirSync('images', { recursive: true });
  for (let index = 0; index < sources.length; index += 1) {
    await saveImage(sources[index], index);
  }
} finally {
  // Close in every case so the remote session is released even after an error.
  await browser.close();
}
3. Check the output
Run with node images.mjs. All images are saved into an images/ directory.
1. Install dependencies
pip install requests playwright
playwright install chromium
2. Connect, extract, and download
import os
import requests
from urllib.parse import urlparse
from playwright.sync_api import sync_playwright
TOKEN = 'YOUR_API_TOKEN_HERE'
WS_ENDPOINT = f'wss://production-sfo.browserless.io?token={TOKEN}'

# JavaScript evaluated in the page: resolved src of every <img>, limited to http(s).
COLLECT_SOURCES_JS = """() =>
    Array.from(document.querySelectorAll('img'))
        .map(img => img.src)
        .filter(src => src.startsWith('http'))
"""


def save_image(source, index):
    """Download ``source`` and write it to images/image-<index><ext>."""
    response = requests.get(source)
    # Keep the URL path's extension, defaulting to .jpg when there is none.
    suffix = os.path.splitext(urlparse(source).path)[1] or '.jpg'
    filename = f'images/image-{index}{suffix}'
    with open(filename, 'wb') as out:
        out.write(response.content)
    print(f'Saved {filename}')


with sync_playwright() as playwright:
    browser = playwright.chromium.connect_over_cdp(WS_ENDPOINT)
    try:
        # Reuse the existing default context: browser.new_page() would create a
        # fresh context that doesn't inherit proxy, profile, or launch settings.
        default_context = browser.contexts[0]
        page = default_context.new_page()
        page.goto('https://example.com')
        page.wait_for_load_state('networkidle')

        sources = page.evaluate(COLLECT_SOURCES_JS)
        print(f'Found {len(sources)} images')

        os.makedirs('images', exist_ok=True)
        index = 0
        while index < len(sources):
            save_image(sources[index], index)
            index += 1
    finally:
        # Close in every case so the remote session is released even after an error.
        browser.close()
3. Check the output
Run with python images.py. All images are saved into an images/ directory.
1. Install dependencies
Add the Playwright dependency to your pom.xml:
<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>playwright</artifactId>
<version>1.44.0</version>
</dependency>
2. Connect, extract, and download
import com.microsoft.playwright.*;
import java.net.URI;
import java.net.http.*;
import java.nio.file.*;
import java.util.List;
public class DownloadImages {
public static void main(String[] args) throws Exception {
String TOKEN = "YOUR_API_TOKEN_HERE";
String WS_ENDPOINT = "wss://production-sfo.browserless.io?token=" + TOKEN;
try (Playwright playwright = Playwright.create()) {
Browser browser = playwright.chromium().connectOverCDP(WS_ENDPOINT);
try {
// Use the default context — creating a new one doesn't inherit launch settings.
BrowserContext context = browser.contexts().get(0);
Page page = context.newPage();
page.navigate("https://example.com");
page.waitForLoadState(LoadState.NETWORKIDLE);
@SuppressWarnings("unchecked")
List<String> imageUrls = (List<String>) page.evaluate("""
Array.from(document.querySelectorAll('img'))
.map(img => img.src)
.filter(src => src.startsWith('http'))
""");
System.out.println("Found " + imageUrls.size() + " images");
Files.createDirectories(Path.of("images"));
HttpClient client = HttpClient.newHttpClient();
for (int i = 0; i < imageUrls.size(); i++) {
String url = imageUrls.get(i);
HttpRequest req = HttpRequest.newBuilder().uri(URI.create(url)).build();
HttpResponse<byte[]> res = client.send(req, HttpResponse.BodyHandlers.ofByteArray());
// Use the URL's file extension; fall back to .jpg for extensionless paths.
// Use URI.getPath() to strip query strings before extracting the extension.
String path = URI.create(url).getPath();
int dot = path.lastIndexOf('.');
String ext = dot >= 0 ? path.substring(dot) : ".jpg";
Path dest = Path.of("images/image-" + i + ext);
Files.write(dest, res.body());
System.out.println("Saved " + dest);
}
} finally {
// Always close to release the session even on error.
browser.close();
}
}
}
}
3. Check the output
Compile with mvn compile and run with mvn exec:java -Dexec.mainClass=DownloadImages. All images are saved into an images/ directory.
1. Install dependencies
dotnet add package Microsoft.Playwright
playwright install chromium
2. Connect, extract, and download
using Microsoft.Playwright;
using System;
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;
/// <summary>
/// Connects to a remote Browserless browser over CDP, collects the resolved src of
/// every &lt;img&gt; on the page and downloads each one into the images/ directory.
/// </summary>
class DownloadImages
{
    static async Task Main()
    {
        const string TOKEN = "YOUR_API_TOKEN_HERE";
        string WS_ENDPOINT = $"wss://production-sfo.browserless.io?token={TOKEN}";

        using var playwright = await Playwright.CreateAsync();
        var browser = await playwright.Chromium.ConnectOverCDPAsync(WS_ENDPOINT);
        try
        {
            // Use the default context — creating a new one doesn't inherit launch settings.
            var context = browser.Contexts[0];
            var page = await context.NewPageAsync();
            await page.GotoAsync("https://example.com",
                new() { WaitUntil = WaitUntilState.NetworkIdle });

            var imageUrls = await page.EvaluateAsync<string[]>("""
                Array.from(document.querySelectorAll('img'))
                    .map(img => img.src)
                    .filter(src => src.startsWith('http'))
                """);
            Console.WriteLine($"Found {imageUrls.Length} images");

            Directory.CreateDirectory("images");
            using var client = new HttpClient();

            // Number files by successful downloads so skipped URLs leave no gaps.
            int saved = 0;
            foreach (var url in imageUrls)
            {
                byte[] bytes;
                try
                {
                    bytes = await client.GetByteArrayAsync(url);
                }
                catch (HttpRequestException ex)
                {
                    // GetByteArrayAsync throws on non-success status codes; skip this
                    // image instead of aborting the remaining downloads.
                    Console.Error.WriteLine($"Skipping {url}: {ex.Message}");
                    continue;
                }

                // Use the URL's file extension; fall back to .jpg for extensionless paths.
                string ext = Path.GetExtension(new Uri(url).AbsolutePath) is { Length: > 0 } e ? e : ".jpg";
                string filename = $"images/image-{saved}{ext}";
                await File.WriteAllBytesAsync(filename, bytes);
                Console.WriteLine($"Saved {filename}");
                saved++;
            }
        }
        finally
        {
            // Always close to release the session even on error.
            await browser.CloseAsync();
        }
    }
}
3. Check the output
Run with dotnet run. All images are saved into an images/ directory.
1. Install dependencies
go get github.com/chromedp/chromedp
2. Connect, extract, and download
package main
import (
"context"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"github.com/chromedp/chromedp"
)
// main attaches to a remote Browserless browser with chromedp, collects the
// resolved src of every <img> on the page and downloads each one into images/.
func main() {
	token := "YOUR_API_TOKEN_HERE"
	ws := fmt.Sprintf("wss://production-sfo.browserless.io?token=%s", token)

	allocCtx, cancel := chromedp.NewRemoteAllocator(context.Background(), ws, chromedp.NoModifyURL)
	defer cancel()
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	var imageUrls []string
	if err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"),
		// Wait for <body>, not "img": a page without any <img> element would
		// otherwise block here forever because ctx has no deadline.
		chromedp.WaitReady("body"),
		// img.src returns fully-resolved absolute URLs, so no relative URL handling needed.
		chromedp.Evaluate(`
			Array.from(document.querySelectorAll('img'))
				.map(img => img.src)
				.filter(src => src.startsWith('http'))
		`, &imageUrls),
	); err != nil {
		panic(err)
	}
	fmt.Printf("Found %d images\n", len(imageUrls))

	if err := os.MkdirAll("images", 0755); err != nil {
		panic(err)
	}

	// Number files by successful downloads so skipped URLs leave no gaps.
	saved := 0
	for _, url := range imageUrls {
		resp, err := http.Get(url)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Skipping", url+":", err)
			continue
		}
		data, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			fmt.Fprintln(os.Stderr, "Skipping", url+":", err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			// Don't write an error page to disk under an image name.
			fmt.Fprintln(os.Stderr, "Skipping", url+":", resp.Status)
			continue
		}

		// Strip query strings before extracting the extension.
		ext := filepath.Ext(strings.Split(url, "?")[0])
		if ext == "" {
			ext = ".jpg"
		}
		filename := fmt.Sprintf("images/image-%d%s", saved, ext)
		if err := os.WriteFile(filename, data, 0644); err != nil {
			fmt.Fprintln(os.Stderr, "Could not write", filename+":", err)
			continue
		}
		fmt.Println("Saved", filename)
		saved++
	}
}
3. Check the output
Run with go run main.go. All images are saved into an images/ directory.
Next steps
- Scrape Structured Data — extract text and structured data from pages
- Take a Screenshot — capture a visual snapshot of the full page