E-Commerce Scraping Guide

Extract product data, prices, and inventory from online stores

What You'll Learn

  • Scrape product catalogs and listings
  • Extract prices, ratings, and reviews
  • Handle pagination and dynamic content
  • Monitor inventory and price changes
  • Export data for analysis

Basic Product Scraping

scrape_products.py
python
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Scrape product listing page
result = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    format="json"
)

# Process products. The fields below assume the engine extracted them;
# use .get() for any field that may be missing.
for product in result.data:
    print(f"Product: {product['name']}")
    print(f"Price: ${product['price']}")
    print(f"Rating: {product['rating']}/5")
    print(f"In Stock: {product['in_stock']}")
    print(f"URL: {product['url']}")
    print("-" * 50)

Multi-Page Scraping

scrape_with_pagination.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Scrape multiple pages automatically
result = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    pagination={
        "enabled": True,
        "max_pages": 50,
        "selector": "a.next-page"  # CSS selector for "Next" button
    }
)

print(f"Total pages scraped: {result.pages_scraped}")
print(f"Total products found: {len(result.data)}")

# Save to CSV for analysis
df = pd.DataFrame(result.data)
df.to_csv('products.csv', index=False)
print("Data exported to products.csv")

Scraping Product Details

scrape_product_details.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# First, get product listing
listing_result = client.scrape(
    url="https://example-shop.com/category/electronics",
    engine="neural-x1"
)

# Then scrape each product detail page
detailed_products = []

for product in listing_result.data[:10]:  # First 10 products
    detail_result = client.scrape(
        url=product['url'],
        engine="neural-x1",
        render_js=True,
        wait_for_selector=".product-details"
    )

    if detail_result.data:
        detailed_products.append(detail_result.data[0])

    print(f"Scraped: {product['name']}")

# Export detailed data (to_excel needs an Excel engine such as openpyxl)
df = pd.DataFrame(detailed_products)
df.to_excel('detailed_products.xlsx', index=False)
print(f"Exported {len(detailed_products)} detailed product records")

Price Monitoring System

price_monitor.py
python
from scrapehub import ScrapeHubClient
import pandas as pd
from datetime import datetime

class PriceMonitor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)
        self.history_file = 'price_history.csv'

    def check_prices(self, urls):
        """Check current prices for list of product URLs"""
        results = []

        for url in urls:
            result = self.client.scrape(
                url=url,
                engine="neural-x1"
            )

            if result.data:
                product = result.data[0]
                results.append({
                    'timestamp': datetime.now(),
                    'url': url,
                    'name': product.get('name'),
                    'price': product.get('price'),
                    'in_stock': product.get('in_stock', True)
                })

        return pd.DataFrame(results)

    def save_history(self, new_data):
        """Append new price data to history"""
        try:
            history = pd.read_csv(self.history_file)
            history = pd.concat([history, new_data], ignore_index=True)
        except FileNotFoundError:
            history = new_data

        history.to_csv(self.history_file, index=False)
        print(f"Saved {len(new_data)} price records")

    def get_price_changes(self):
        """Detect price changes since last check"""
        history = pd.read_csv(self.history_file)
        history['timestamp'] = pd.to_datetime(history['timestamp'])

        # Get the two most recent price records for each URL
        latest = history.sort_values('timestamp').groupby('url').tail(2)

        changes = []
        for url in latest['url'].unique():
            url_data = latest[latest['url'] == url]
            if len(url_data) == 2:
                old_price = url_data.iloc[0]['price']
                new_price = url_data.iloc[1]['price']

                if old_price != new_price:
                    changes.append({
                        'name': url_data.iloc[1]['name'],
                        'url': url,
                        'old_price': old_price,
                        'new_price': new_price,
                        'change': new_price - old_price,
                        'percent_change': ((new_price - old_price) / old_price) * 100
                    })

        return pd.DataFrame(changes)

# Usage
monitor = PriceMonitor("sk_live_xxxx_449x")

# Products to monitor
urls = [
    "https://example-shop.com/product/abc",
    "https://example-shop.com/product/xyz",
    "https://example-shop.com/product/123"
]

# Check current prices
current_prices = monitor.check_prices(urls)
monitor.save_history(current_prices)

# Detect changes
changes = monitor.get_price_changes()
if not changes.empty:
    print("\nPrice Changes Detected:")
    for _, change in changes.iterrows():
        print(f"\n{change['name']}")
        print(f"  Old: $${change['old_price']:.2f}")
        print(f"  New: $${change['new_price']:.2f}")
        print(f"  Change: $${change['change']:.2f} ({change['percent_change']:.1f}%)")

Competitor Analysis

competitor_analysis.py
python
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd

async def analyze_competitors():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Competitor URLs for same product category
    competitors = {
        'Store A': 'https://store-a.com/category/laptops',
        'Store B': 'https://store-b.com/laptops',
        'Store C': 'https://store-c.com/computers/laptops'
    }

    # Scrape all competitors concurrently. create_task schedules each
    # request right away; awaiting the bare coroutines one at a time
    # would run them sequentially.
    tasks = []
    for store_name, url in competitors.items():
        task = asyncio.create_task(client.scrape(
            url=url,
            engine="neural-x1",
            pagination={"enabled": True, "max_pages": 5}
        ))
        tasks.append((store_name, task))

    # Collect results
    all_products = []

    for store_name, task in tasks:
        result = await task
        for product in result.data:
            product['store'] = store_name
            all_products.append(product)

        print(f"{store_name}: {len(result.data)} products")

    return pd.DataFrame(all_products)

# Run analysis
df = asyncio.run(analyze_competitors())

# Price comparison
print("\nPrice Statistics by Store:")
print(df.groupby('store')['price'].describe())

# Find cheapest options (products are matched by exact name here;
# real catalogs usually need fuzzier matching)
print("\n=== Cheapest Products ===")
for product_name in df['name'].unique()[:5]:
    product_df = df[df['name'] == product_name]
    cheapest = product_df.loc[product_df['price'].idxmin()]
    print(f"\n{product_name}")
    print(f"  Cheapest at: {cheapest['store']} - ${cheapest['price']}")

# Export for further analysis
df.to_excel('competitor_analysis.xlsx', index=False)
print("\nFull analysis exported to competitor_analysis.xlsx")

Automated Inventory Monitoring

inventory_monitor.py
python
from scrapehub import ScrapeHubClient
import smtplib
from email.mime.text import MIMEText
from datetime import datetime

class InventoryMonitor:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def check_availability(self, products):
        """Check if products are in stock"""
        alerts = []

        for product in products:
            result = self.client.scrape(
                url=product['url'],
                engine="neural-x1"
            )

            if result.data:
                item = result.data[0]
                in_stock = item.get('in_stock', False)

                if in_stock and not product.get('was_in_stock', False):
                    # Product back in stock!
                    alerts.append({
                        'type': 'back_in_stock',
                        'name': item['name'],
                        'price': item['price'],
                        'url': product['url']
                    })

                # Update status
                product['was_in_stock'] = in_stock
                product['last_checked'] = datetime.now()

        return alerts

    def send_alert(self, alerts, email_to):
        """Send email notification for stock alerts"""
        if not alerts:
            return

        message = "Stock Alerts:\n\n"
        for alert in alerts:
            message += f"✅ {alert['name']} is back in stock!\n"
            message += f"   Price: ${alert['price']}\n"
            message += f"   URL: {alert['url']}\n\n"

        msg = MIMEText(message)
        msg['Subject'] = f"Stock Alert: {len(alerts)} items available"
        msg['From'] = "alerts@yourdomain.com"
        msg['To'] = email_to

        # Send the email (uncomment and configure your SMTP settings)
        # smtp = smtplib.SMTP('smtp.gmail.com', 587)
        # smtp.starttls()
        # smtp.login('your_email', 'your_password')
        # smtp.send_message(msg)
        # smtp.quit()

        print(f"Alert sent: {len(alerts)} items back in stock")

# Usage
monitor = InventoryMonitor("sk_live_xxxx_449x")

# Products to watch
watchlist = [
    {'url': 'https://example-shop.com/product/limited-edition-1', 'was_in_stock': False},
    {'url': 'https://example-shop.com/product/limited-edition-2', 'was_in_stock': False},
]

# Check availability
alerts = monitor.check_availability(watchlist)

# Send notifications
if alerts:
    monitor.send_alert(alerts, "your-email@example.com")
    print(f"Found {len(alerts)} products back in stock!")
else:
    print("No new items in stock")

Best Practices

  • Use pagination to scrape entire catalogs efficiently
  • Enable JavaScript rendering for dynamic price updates
  • Set up webhooks for long-running scrape jobs (see the sketch after this list)
  • Schedule regular checks to monitor price changes
  • Respect rate limits; use async jobs for bulk operations
  • Store historical data for trend analysis
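
For the webhook item above: the exact job-submission interface isn't shown in this guide, so treat the following as a sketch. It assumes the scrape call accepts a webhook_url parameter for asynchronous delivery; check the API reference for the actual method and parameter names.

webhook_job.py
python
from scrapehub import ScrapeHubClient

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Hypothetical: submit a long-running crawl and have results POSTed to
# your endpoint when the job finishes, instead of blocking on the call.
# The webhook_url parameter is an assumption, not a documented name.
job = client.scrape(
    url="https://example-shop.com/products",
    engine="neural-x1",
    pagination={"enabled": True, "max_pages": 500},
    webhook_url="https://yourdomain.com/scrapehub/callback"
)

print("Job submitted; results will be delivered to the webhook endpoint")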

Common Use Cases

Price Intelligence

  • Track competitor pricing
  • Dynamic pricing strategies
  • Historical price trends (see the sketch below)
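
For trend analysis, the price_history.csv file written by the Price Monitoring System above is already in a usable shape. A minimal sketch using pandas and matplotlib (matplotlib is an extra dependency not used elsewhere in this guide):

plot_price_trends.py
python
import pandas as pd
import matplotlib.pyplot as plt

# Load the history written by PriceMonitor.save_history()
history = pd.read_csv('price_history.csv', parse_dates=['timestamp'])

# One line per product: price over time
for name, group in history.groupby('name'):
    group = group.sort_values('timestamp')
    plt.plot(group['timestamp'], group['price'], marker='o', label=name)

plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Price history by product')
plt.legend()
plt.tight_layout()
plt.savefig('price_trends.png')
print("Chart saved to price_trends.png")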

Inventory Management

  • Stock availability monitoring
  • Restock alerts
  • Product lifecycle tracking

Market Research

  • Product catalog analysis
  • Category trends
  • Brand monitoring

Review Analysis

  • Customer sentiment
  • Rating aggregation (see the sketch below)
  • Competitor comparison
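
Rating aggregation can run directly on the exports produced earlier. A minimal sketch, assuming the competitor_analysis.xlsx file from the Competitor Analysis section and that the extraction engine returned a numeric rating field (reading .xlsx requires the openpyxl package):

aggregate_ratings.py
python
import pandas as pd

# Assumes the export from competitor_analysis.py, with 'store' and
# 'rating' columns present
df = pd.read_excel('competitor_analysis.xlsx')

# Average rating and product count per store, best-rated first
summary = (
    df.dropna(subset=['rating'])
      .groupby('store')['rating']
      .agg(['mean', 'count'])
      .rename(columns={'mean': 'avg_rating', 'count': 'num_products'})
      .sort_values('avg_rating', ascending=False)
)

print(summary)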