Lead Generation Guide

Extract business contacts and build qualified lead lists

What You'll Learn

  • Scrape business directories and listings
  • Extract contact information (emails, phones, addresses)
  • Build targeted lead databases
  • Enrich existing lead data
  • Export leads to CRM systems

Basic Lead Scraping

scrape_leads.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Scrape business directory
result = client.scrape(
    url="https://business-directory.com/category/software-companies",
    engine="neural-x1",
    format="json"
)

# Process leads
leads = []
for business in result.data:
    lead = {
        'company_name': business.get('name'),
        'email': business.get('email'),
        'phone': business.get('phone'),
        'website': business.get('website'),
        'address': business.get('address'),
        'industry': business.get('category'),
        'description': business.get('description')
    }
    leads.append(lead)

# Export to CSV
df = pd.DataFrame(leads)
df.to_csv('leads.csv', index=False)

print(f"Extracted {len(leads)} leads")
print(f"Leads with emails: {df['email'].notna().sum()}")
print(f"Leads with phones: {df['phone'].notna().sum()}")

Multi-City Lead Generation

multi_city_leads.py
python
from scrapehub import AsyncScrapeHubClient
import asyncio
import pandas as pd

async def scrape_city_leads(client, city, category):
    """Scrape leads for a specific city"""
    result = await client.scrape(
        url=f"https://business-directory.com/{city}/{category}",
        engine="neural-x1",
        pagination={
            "enabled": True,
            "max_pages": 10
        }
    )

    # Add city to each lead
    for lead in result.data:
        lead['city'] = city

    return result.data

async def main():
    client = AsyncScrapeHubClient(api_key="sk_live_xxxx_449x")

    # Target cities
    cities = ['new-york', 'los-angeles', 'chicago', 'houston', 'phoenix']
    category = 'restaurants'

    # Scrape all cities concurrently
    tasks = [scrape_city_leads(client, city, category) for city in cities]
    results = await asyncio.gather(*tasks)

    # Combine all leads
    all_leads = []
    for city_leads in results:
        all_leads.extend(city_leads)

    # Export
    df = pd.DataFrame(all_leads)
    df.to_excel('multi_city_leads.xlsx', index=False)

    print(f"Total leads collected: {len(all_leads)}")
    print(f"\nLeads by city:")
    print(df['city'].value_counts())

asyncio.run(main())
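
asyncio.gather() fires every city request at once. For longer city lists you may want to bound concurrency so you don't hammer the source site or burn through your plan's rate limit. A minimal sketch using asyncio.Semaphore (the limit of 3 is an arbitrary choice):

python
semaphore = asyncio.Semaphore(3)  # at most 3 cities in flight at a time

async def scrape_city_leads_limited(client, city, category):
    async with semaphore:
        return await scrape_city_leads(client, city, category)

# In main(), build the task list from the limited variant instead:
# tasks = [scrape_city_leads_limited(client, city, category) for city in cities]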

Contact Enrichment

enrich_leads.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

class LeadEnricher:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def enrich_from_website(self, website_url):
        """Extract additional info from company website"""
        result = self.client.scrape(
            url=website_url,
            engine="neural-x1",
            render_js=True
        )

        if result.data:
            company_data = result.data[0]
            return {
                'email': company_data.get('contact_email'),
                'phone': company_data.get('phone'),
                'social_linkedin': company_data.get('linkedin_url'),
                'social_twitter': company_data.get('twitter_url'),
                'employee_count': company_data.get('employees'),
                'founded_year': company_data.get('founded')
            }

        return {}

    def enrich_leads(self, leads_df):
        """Enrich existing lead database with additional data"""
        enriched_leads = []

        for i, (_, lead) in enumerate(leads_df.iterrows(), start=1):
            print(f"Enriching {i}/{len(leads_df)}: {lead['company_name']}")

            enriched = lead.to_dict()

            # Scrape company website if available
            if pd.notna(lead['website']):
                extra_data = self.enrich_from_website(lead['website'])
                enriched.update(extra_data)

            enriched_leads.append(enriched)

        return pd.DataFrame(enriched_leads)

# Usage
enricher = LeadEnricher("sk_live_xxxx_449x")

# Load existing leads
leads_df = pd.read_csv('leads.csv')

# Enrich with additional data
enriched_df = enricher.enrich_leads(leads_df.head(50))  # First 50 leads

# Save enriched data
enriched_df.to_csv('enriched_leads.csv', index=False)

print(f"\nEnrichment complete!")
print(f"Leads with LinkedIn: {enriched_df['social_linkedin'].notna().sum()}")
print(f"Leads with employee count: {enriched_df['employee_count'].notna().sum()}")

Industry-Specific Lead Lists

industry_leads.py
python
from scrapehub import ScrapeHubClient
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

# Target multiple industries
industries = [
    'software-development',
    'digital-marketing',
    'consulting',
    'real-estate',
    'healthcare'
]

all_leads = []

for industry in industries:
    print(f"\nScraping {industry}...")

    result = client.scrape(
        url=f"https://business-directory.com/category/{industry}",
        engine="neural-x1",
        pagination={
            "enabled": True,
            "max_pages": 20
        }
    )

    # Add industry tag
    for lead in result.data:
        lead['industry'] = industry
        all_leads.append(lead)

    print(f"  Found {len(result.data)} leads")

# Create DataFrame
df = pd.DataFrame(all_leads)

# Filter quality leads (have email AND phone)
quality_leads = df[df['email'].notna() & df['phone'].notna()]

print(f"\n=== Lead Generation Summary ===")
print(f"Total leads: {len(df)}")
print(f"Quality leads (email + phone): {len(quality_leads)}")
print(f"\nLeads by industry:")
print(df['industry'].value_counts())

# Export both sets
df.to_excel('all_leads.xlsx', index=False, sheet_name='All Leads')
quality_leads.to_excel('quality_leads.xlsx', index=False, sheet_name='Quality Leads')

print("\nExported to all_leads.xlsx and quality_leads.xlsx")

LinkedIn Company Scraping

linkedin_leads.py
python
from scrapehub import ScrapeHubClient
from urllib.parse import quote_plus
import pandas as pd

client = ScrapeHubClient(api_key="sk_live_xxxx_449x")

def scrape_linkedin_companies(search_query, location=None):
    """Scrape companies from LinkedIn search"""

    # Build search URL (URL-encode values so spaces don't break the query string)
    url = f"https://www.linkedin.com/search/results/companies/?keywords={quote_plus(search_query)}"
    if location:
        url += f"&location={quote_plus(location)}"

    result = client.scrape(
        url=url,
        engine="neural-x1",
        render_js=True,
        wait_for_selector=".search-results-container",
        pagination={
            "enabled": True,
            "max_pages": 5
        }
    )

    return result.data

# Search for SaaS companies in San Francisco
companies = scrape_linkedin_companies(
    search_query="SaaS software",
    location="San Francisco Bay Area"
)

# Process and export
df = pd.DataFrame(companies)

print(f"Found {len(companies)} companies")
print(f"\nCompany size distribution:")
if 'company_size' in df.columns:
    print(df['company_size'].value_counts())

# Export
df.to_csv('linkedin_companies.csv', index=False)
print("\nExported to linkedin_companies.csv")

Email Finder & Validation

email_finder.py
python
from scrapehub import ScrapeHubClient
import pandas as pd
import re

class EmailFinder:
    def __init__(self, api_key):
        self.client = ScrapeHubClient(api_key=api_key)

    def find_emails_on_website(self, url):
        """Find all email addresses on a website"""
        result = self.client.scrape(
            url=url,
            engine="neural-x1",
            render_js=True
        )

        emails = set()
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

        if result.data:
            # Extract from structured data
            for item in result.data:
                if isinstance(item, dict):
                    for value in item.values():
                        if isinstance(value, str):
                            found = re.findall(email_pattern, value)
                            emails.update(found)

        return list(emails)

    def find_contact_page_emails(self, domain):
        """Check contact/about pages for emails"""
        contact_pages = [
            f"https://{domain}/contact",
            f"https://{domain}/about",
            f"https://{domain}/contact-us",
            f"https://{domain}/team"
        ]

        all_emails = []

        for page in contact_pages:
            try:
                emails = self.find_emails_on_website(page)
                all_emails.extend(emails)
                if emails:
                    print(f"  Found {len(emails)} emails on {page}")
            except Exception:
                # Page may 404 or fail to render; skip and try the next one
                continue

        return list(set(all_emails))  # Remove duplicates

# Usage
finder = EmailFinder("sk_live_xxxx_449x")

# Load leads without emails
leads_df = pd.read_csv('leads.csv')
missing_email = leads_df[leads_df['email'].isna()]

print(f"Finding emails for {len(missing_email)} leads...")

for idx, lead in missing_email.iterrows():
    if pd.notna(lead['website']):
        print(f"\n{lead['company_name']}")

        # Extract domain
        domain = lead['website'].replace('http://', '').replace('https://', '').split('/')[0]

        # Find emails
        emails = finder.find_contact_page_emails(domain)

        if emails:
            print(f"  ✓ Found: {', '.join(emails)}")
            leads_df.at[idx, 'email'] = emails[0]  # Use first email
        else:
            print(f"  ✗ No emails found")

# Save updated leads
leads_df.to_csv('leads_with_emails.csv', index=False)
print(f"\nUpdated leads saved to leads_with_emails.csv")

CRM Integration (Salesforce)

export_to_crm.py
python
from scrapehub import ScrapeHubClient
from simple_salesforce import Salesforce
import pandas as pd

class CRMExporter:
    def __init__(self, scrapehub_api_key, sf_username, sf_password, sf_token):
        self.scraper = ScrapeHubClient(api_key=scrapehub_api_key)
        self.sf = Salesforce(
            username=sf_username,
            password=sf_password,
            security_token=sf_token
        )

    def scrape_and_export(self, url, lead_source):
        """Scrape leads and export to Salesforce"""

        # Scrape leads
        result = self.scraper.scrape(
            url=url,
            engine="neural-x1",
            pagination={"enabled": True, "max_pages": 10}
        )

        print(f"Scraped {len(result.data)} leads")

        # Export to Salesforce
        success_count = 0
        for lead in result.data:
            try:
                self.sf.Lead.create({
                    'Company': lead.get('company_name'),
                    'LastName': lead.get('contact_name', 'Unknown'),
                    'Email': lead.get('email'),
                    'Phone': lead.get('phone'),
                    'Website': lead.get('website'),
                    'Street': lead.get('address'),
                    'Industry': lead.get('industry'),
                    'LeadSource': lead_source,
                    'Description': lead.get('description'),
                    'Status': 'New'
                })
                success_count += 1
            except Exception as e:
                print(f"Error creating lead: {e}")

        print(f"\nSuccessfully exported {success_count}/{len(result.data)} leads to Salesforce")

# Usage
exporter = CRMExporter(
    scrapehub_api_key="sk_live_xxxx_449x",
    sf_username="your-sf-username",
    sf_password="your-sf-password",
    sf_token="your-sf-token"
)

# Scrape and export
exporter.scrape_and_export(
    url="https://business-directory.com/category/saas-companies",
    lead_source="Web Scraping - Business Directory"
)
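
Creating leads one REST call at a time gets slow past a few hundred records. simple_salesforce also exposes the Bulk API; a sketch of the same export as a single job, where scraped_leads stands in for the result.data list used above:

python
# scraped_leads: the list of lead dicts returned by the scrape call
records = [
    {
        'Company': lead.get('company_name'),
        'LastName': lead.get('contact_name', 'Unknown'),
        'Email': lead.get('email'),
        'LeadSource': 'Web Scraping - Business Directory',
        'Status': 'New'
    }
    for lead in scraped_leads
]

results = exporter.sf.bulk.Lead.insert(records, batch_size=5000)
failed = [r for r in results if not r['success']]
print(f"Bulk insert: {len(records) - len(failed)} created, {len(failed)} failed")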

Best Practices

  • Always verify email addresses before using them for outreach (see Email Finder & Validation above)
  • Comply with GDPR, CCPA, and local data privacy regulations
  • Use pagination to collect comprehensive lead lists
  • Enrich leads with social media profiles for better targeting
  • Segment leads by industry, location, or company size (see the sketch after this list)
  • Regularly update lead data to maintain accuracy
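
A minimal pandas sketch for the segmentation bullet above (the column names match the CSVs produced earlier in this guide):

python
import pandas as pd

df = pd.read_csv('enriched_leads.csv')

# Write one CSV per industry segment
for industry, segment in df.groupby('industry'):
    segment.to_csv(f"segment_{industry}.csv", index=False)
    print(f"{industry}: {len(segment)} leads")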

Common Data Sources

Business Directories

  • Yellow Pages
  • Yelp
  • Google My Business
  • Industry-specific directories

Professional Networks

  • LinkedIn Companies
  • Crunchbase
  • AngelList
  • Product Hunt

Review Sites

  • G2
  • Capterra
  • TrustPilot
  • Clutch

Industry Platforms

  • Trade associations
  • Conference attendees
  • Award winners
  • Press releases