/*
 * Decompiled with CFR 0.152.
 */
package com.anahata.yp.scrape;

import com.anahata.util.net.HMAClient;
import com.anahata.yp.scrape.CategoryData;
import com.anahata.yp.scrape.ExcelReport;
import com.anahata.yp.scrape.model.Category;
import com.anahata.yp.scrape.model.Company;
import java.beans.ConstructorProperties;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.commons.httpclient.ConnectionPoolTimeoutException;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line scraper for Yellow Pages Australia listing pages.
 *
 * <p>{@link #main(String[])} walks every category returned by
 * {@code CategoryData.getCategories()}, downloads the paginated search results
 * (caching each raw page under {@code DOWNLOAD_DIR}), extracts one
 * {@link Company} per listing by plain substring matching, serializes the
 * collected set to {@code TARGET_SER_FILE} and finally produces two Excel
 * reports: all companies, and only those with an e-mail address.
 *
 * <p>All state is static and unsynchronized; the class is written for a single
 * scraping thread.
 */
public class YellowPagesWebScraper {
    private static final Logger log = LoggerFactory.getLogger(YellowPagesWebScraper.class);

    // ---------------------------------------------------------------------
    // Substring markers used to cut values out of the (normalized) HTML.
    // They are passed through normalize() so that they match the whitespace
    // form of the pages returned by getPage().
    // NOTE: normalize() is invoked from these static initializers, so it must
    // not depend on any static field declared further down in this class.
    // ---------------------------------------------------------------------
    private static final String RESULTS_COUNT_START = YellowPagesWebScraper.normalize("<span class='emphasise'>");
    private static final String RESULTS_COUNT_END = YellowPagesWebScraper.normalize(" Results </span>for <h1>");
    private static final String NAME_START = YellowPagesWebScraper.normalize("<meta itemprop=\"name\" content=\"");
    private static final String NAME_END = YellowPagesWebScraper.normalize("\"/>");
    private static final String CATEGORY_START = YellowPagesWebScraper.normalize("data-heading-name='");
    private static final String CATEGORY_END = YellowPagesWebScraper.normalize("'");
    private static final String EMAIL_START = YellowPagesWebScraper.normalize("data-email='");
    private static final String EMAIL_END = YellowPagesWebScraper.normalize("'");
    private static final String WEBSITE_START = YellowPagesWebScraper.normalize("' rel='nofollow' target='_blank' title='");
    private static final String WEBSITE_END = YellowPagesWebScraper.normalize(" (opens in a new window");
    private static final String PHONE_START = YellowPagesWebScraper.normalize("<a href='tel:");
    private static final String PHONE_END = YellowPagesWebScraper.normalize("'");
    private static final String ADDRESS_START = YellowPagesWebScraper.normalize("data-address-line='");
    private static final String ADDRESS_END = YellowPagesWebScraper.normalize("'");
    private static final String LOCALITY_STATE_START = YellowPagesWebScraper.normalize("data-address-suburb='");
    private static final String LOCALITY_STATE_END = YellowPagesWebScraper.normalize("'");
    /** Common prefix of logo image URLs; stripped by the LOGO_START marker and re-added when storing the logo. */
    private static final String LOGO_URL_PREFIX = "http://www.yellowpages.com.au/content/if/PRODUCTS/";
    private static final String LOGO_START = YellowPagesWebScraper.normalize("src=\"" + LOGO_URL_PREFIX);
    private static final String LOGO_END = YellowPagesWebScraper.normalize("\"");
    private static final String RESULTS_START = YellowPagesWebScraper.normalize("platinum-ads clearfix");
    private static final String ITEM_START = YellowPagesWebScraper.normalize("data-business-name='");
    private static final String ITEM_END = YellowPagesWebScraper.normalize("Send to mobile");
    private static final String COMPANY_ID_START = "listingId=";
    private static final String COMPANY_ID_END = "&";
    private static final String STATE = "qld";
    /** Search URL template; the two {@code %s} slots are the URL-encoded category name and the page number. */
    private static final String DOWNLOAD_URL = "http://www.yellowpages.com.au/search/listings?showAllLocations=false&headingCode=19682&visitedIAPages=1&visitedIAPages=3&referredBy=YOL&eventType=pagination&selectedViewMode=list&stateId=3&clue=%s&context=businessTypeSearch&pageNumber=%s&lruPageNumber=1&locationClue=qld&toggleViewTriggeredCount=0";
    private static final File WORKING_DIR = new File("e:\\yp", "qld");
    private static final File DOWNLOAD_DIR = new File(WORKING_DIR, "downloads");
    private static final File TARGET_FILE = new File(WORKING_DIR, "yp-all-qld.xlsx");
    private static final File TARGET_FILE_EMAIL_ONLY = new File(WORKING_DIR, "yp-email-qld.xlsx");
    private static final File TARGET_SER_FILE = new File(WORKING_DIR, "companies.ser");
    /** Text whose presence in a response marks it as a block page rather than real results. */
    private static final String UNUSUAL = "unusual traffic activity";

    /** Divisor used to turn the reported result count into a page count. */
    private static final int RESULTS_PER_PAGE = 40;
    /** Upper bound on the number of pages requested per category. */
    private static final int MAX_PAGES = 25;
    /** A cookie reset is performed whenever the request counter is a multiple of this value. */
    private static final int COOKIE_RESET_INTERVAL = 100;

    private static final HttpClient client = new HttpClient((HttpConnectionManager)new MultiThreadedHttpConnectionManager());
    /** Not used by the code in this class; kept because it is part of the public surface. */
    public static ExecutorService THREAD_POOL = Executors.newCachedThreadPool();
    /** Set by {@link #parsePage} when it hits an item it cannot attribute to the requested category. */
    public static boolean categoryEnded = false;
    public static HMAClient hmaClient = new HMAClient("C:\\Program Files (x86)\\HMA! Pro VPN\\bin\\HMA! Pro VPN.exe");
    /** Number of HTTP requests actually executed by {@link #download}; not thread-safe. */
    private static int realRequestCount = 0;

    /**
     * Entry point: scrapes (or reloads from {@code TARGET_SER_FILE}) all companies and
     * writes the "all" and "e-mail only" Excel reports.
     *
     * @param args ignored
     * @throws Exception on I/O, VPN or report failures that are not handled per category
     */
    public static void main(String[] args) throws Exception {
        log.debug("download dir is {}", DOWNLOAD_DIR.toURI().toURL());
        DOWNLOAD_DIR.mkdirs();
        TARGET_FILE.delete();
        TreeSet<Company> all;
        if (!TARGET_SER_FILE.exists()) {
            all = YellowPagesWebScraper.scrapeAllCategories();
            YellowPagesWebScraper.writeSerialized(all);
        } else {
            all = YellowPagesWebScraper.readSerialized();
        }
        log.debug("Generating report with all companies");
        new ExcelReport().deleteExcelAndCsv(TARGET_FILE.getParentFile());
        new ExcelReport().runCompanies(all, TARGET_FILE);
        // Second report: drop every company without an e-mail address (mutates 'all' in place).
        Iterator<Company> it = all.iterator();
        while (it.hasNext()) {
            if (StringUtils.isEmpty(it.next().getEmail())) {
                it.remove();
            }
        }
        log.debug("Generating report with companies that have email only (" + all.size() + ")");
        new ExcelReport().runCompanies(all, TARGET_FILE_EMAIL_ONLY);
        log.debug("Report generated");
    }

    /** Connects the VPN client, configures the HTTP client and scrapes every category, best-effort per category. */
    private static TreeSet<Company> scrapeAllCategories() throws Exception {
        hmaClient.connect();
        client.getParams().setCookiePolicy("compatibility");
        client.getParams().setConnectionManagerTimeout(15000L);
        client.getHttpConnectionManager().getParams().setSoTimeout(15000);
        TreeSet<Company> all = new TreeSet<Company>();
        int idx = 0;
        List<Category> cats = CategoryData.getCategories();
        log.debug("Categories: {}", cats.size());
        for (Category category : cats) {
            log.debug("---Starting Category: {} {} / {}", new Object[]{category.getName(), ++idx, cats.size()});
            try {
                all.addAll(YellowPagesWebScraper.findByCategory(category));
            }
            catch (Throwable e) {
                // Deliberately broad: one failing category must not abort the whole run.
                log.error("Category " + category.getName() + " failed, skipping it", e);
            }
        }
        return all;
    }

    /** Serializes the scraped companies to TARGET_SER_FILE, closing the streams even on failure. */
    private static void writeSerialized(TreeSet<Company> all) throws Exception {
        log.debug("Serializing data");
        try (FileOutputStream fos = new FileOutputStream(TARGET_SER_FILE);
                ObjectOutputStream oos = new ObjectOutputStream(fos)) {
            oos.writeObject(all);
        }
    }

    /** Reads the companies previously written by {@link #writeSerialized}, closing the streams even on failure. */
    @SuppressWarnings("unchecked")
    private static TreeSet<Company> readSerialized() throws Exception {
        log.debug("Reading serialized data");
        try (FileInputStream fis = new FileInputStream(TARGET_SER_FILE);
                ObjectInputStream ois = new ObjectInputStream(fis)) {
            // Unchecked but safe as long as the file is the one this class writes itself.
            return (TreeSet<Company>)ois.readObject();
        }
    }

    /**
     * Collects the companies listed under one category, page by page.
     *
     * @param category category whose name is used as the search clue
     * @return the parsed companies; an empty list when the result count cannot be found on page 1
     * @throws Exception if a page cannot be fetched or the result count is not numeric
     */
    public static List<Company> findByCategory(Category category) throws Exception {
        ArrayList<Company> ret = new ArrayList<Company>();
        String firstPage = YellowPagesWebScraper.getPage(category, 1);
        String count = YellowPagesWebScraper.grab(firstPage, RESULTS_COUNT_START, RESULTS_COUNT_END);
        if (count == null) {
            log.error("Could not find page results count for category {}", category);
            return Collections.emptyList();
        }
        // Tolerate padding / thousands separators (e.g. "1,234") instead of failing the whole category.
        int results = Integer.parseInt(count.replace(",", "").trim());
        // Ceiling division: an exact multiple of RESULTS_PER_PAGE must not add a phantom page.
        int totalPages = (results + RESULTS_PER_PAGE - 1) / RESULTS_PER_PAGE;
        log.debug("Expected pages: {}", totalPages);
        int availablePages = Math.min(totalPages, MAX_PAGES);
        categoryEnded = false;
        for (int page = 1; ret.size() < results && page <= availablePages; ++page) {
            // Page 1 was already fetched for the result count; do not read it again.
            String html = page == 1 ? firstPage : YellowPagesWebScraper.getPage(category, page);
            try {
                ret.addAll(YellowPagesWebScraper.parsePage(html, page, category));
            }
            catch (Exception e) {
                log.debug("Exception parsing page " + page, e);
                // NOTE(review): categoryEnded is only consulted on this failure path, so a page
                // that ends the category without throwing does not stop the loop. Kept as-is
                // because stopping earlier could drop listings; confirm the intended behavior.
                if (categoryEnded) {
                    break;
                }
            }
        }
        return ret;
    }

    /**
     * Returns the normalized HTML of one result page, using the on-disk cache when possible.
     *
     * <p>A cached file that turns out to be a block page is discarded and downloaded again.
     *
     * @param category category whose name is used as the search clue and cache file name
     * @param page     1-based page number
     * @return the page content after {@code normalize}
     * @throws IllegalStateException if the download yields no content or a block page
     * @throws Exception             on I/O failures
     */
    public static String getPage(Category category, int page) throws Exception {
        String encodedName = URLEncoder.encode(category.getName(), "UTF-8");
        File downloadLocation = new File(DOWNLOAD_DIR, encodedName + page + ".html");
        if (downloadLocation.exists() && downloadLocation.length() != 0L) {
            log.debug("File already downloaded {}", downloadLocation);
            String cached = FileUtils.readFileToString(downloadLocation);
            log.debug("File size: {}", cached.length());
            if (!cached.contains(UNUSUAL)) {
                return YellowPagesWebScraper.normalize(cached);
            }
            // A cached block page is useless to the parser: fall through and fetch a fresh copy.
            log.debug("Cached file {} is a block page, downloading again", downloadLocation);
        }
        downloadLocation.delete();
        String url = String.format(DOWNLOAD_URL, encodedName, page);
        log.debug("Downloading {} to {}", url, downloadLocation);
        String file = YellowPagesWebScraper.download(url);
        if (file == null) {
            // download() gives up with null after repeated timeouts; fail with a clear message instead of an NPE.
            throw new IllegalStateException("No content downloading " + category.getName() + " page " + page);
        }
        if (file.contains(UNUSUAL)) {
            downloadLocation.delete();
            throw new IllegalStateException("Unusual activity downloading " + category.getName() + " page " + page);
        }
        FileUtils.writeStringToFile(downloadLocation, file);
        return YellowPagesWebScraper.normalize(file);
    }

    /**
     * Extracts the companies of one normalized result page.
     *
     * <p>Parsing stops, with {@link #categoryEnded} set, at the first item that has no end
     * marker, no category, or a category different from {@code category.getName()}. It also
     * stops (without setting the flag) at the first item without a name.
     *
     * @param html     normalized page content as returned by {@link #getPage}
     * @param page     page number, used for logging only
     * @param category category the items are expected to belong to
     * @return the companies parsed before parsing stopped; empty if the results marker is missing
     * @throws Exception if a value's start marker is found without its end marker
     */
    public static List<Company> parsePage(String html, int page, Category category) throws Exception {
        ArrayList<Company> ret = new ArrayList<Company>();
        int resultsStart = html.indexOf(RESULTS_START);
        log.debug("Results start at: {}", resultsStart);
        if (resultsStart == -1) {
            log.error("Could not find results start tag on page: {} html size: {}", page, html.length());
            log.error(html);
            return Collections.emptyList();
        }
        String results = html.substring(resultsStart);
        int pos = results.indexOf(ITEM_START);
        log.debug("Found 1st item start at " + pos);
        while (pos != -1) {
            int itemEnd = results.indexOf(ITEM_END, pos);
            if (itemEnd == -1) {
                log.debug("Could not find ITEM_END after " + ret.size());
                categoryEnded = true;
                break;
            }
            String chunk = results.substring(pos, itemEnd);
            String parsedCategory = YellowPagesWebScraper.grab(chunk, CATEGORY_START, CATEGORY_END);
            if (parsedCategory == null) {
                log.debug("Could not find CATEGORY_START / CATEGORY_END " + CATEGORY_START + "/" + CATEGORY_END + " in " + chunk);
                categoryEnded = true;
                break;
            }
            // Unescaped twice on top of grab()'s own double unescape, as in the original code.
            parsedCategory = StringEscapeUtils.unescapeHtml4(parsedCategory);
            parsedCategory = StringEscapeUtils.unescapeHtml4(parsedCategory);
            if (!category.getName().equals(parsedCategory)) {
                log.debug("Category doesn't match: " + parsedCategory + " / " + category.getName());
                categoryEnded = true;
                break;
            }
            Company c = YellowPagesWebScraper.parseCompany(chunk, category);
            if (c == null) {
                log.debug("Got null name after {}", ret.size());
                break;
            }
            ret.add(c);
            log.debug("Parsed " + c);
            pos = results.indexOf(ITEM_START, pos + 1);
        }
        log.debug("Page {} Returning: {} companies", page, ret.size());
        return ret;
    }

    /** Builds a Company from the HTML of a single listing; returns null when the listing has no name. */
    private static Company parseCompany(String chunk, Category category) {
        Company c = new Company();
        c.setId(YellowPagesWebScraper.grab(chunk, COMPANY_ID_START, COMPANY_ID_END));
        c.setName(YellowPagesWebScraper.grab(chunk, NAME_START, NAME_END));
        if (c.getName() == null) {
            log.warn("Could not find name in " + chunk);
            return null;
        }
        String email = YellowPagesWebScraper.grab(chunk, EMAIL_START, EMAIL_END);
        c.setEmail(email != null && email.contains("@") ? email : "");
        String web = YellowPagesWebScraper.grab(chunk, WEBSITE_START, WEBSITE_END);
        c.setWeb(web != null && web.contains(".") ? web : "");
        c.setPhone(YellowPagesWebScraper.grab(chunk, PHONE_START, PHONE_END));
        c.setAddress(YellowPagesWebScraper.grab(chunk, ADDRESS_START, ADDRESS_END));
        String localityState = YellowPagesWebScraper.grab(chunk, LOCALITY_STATE_START, LOCALITY_STATE_END);
        if (localityState != null) {
            // The value is split at its last space: everything before is the locality, the rest the state.
            int split = localityState.lastIndexOf(' ');
            if (split == -1) {
                // No separator: keep the whole value as locality rather than failing the page.
                c.setLocality(localityState.trim());
            } else {
                c.setLocality(localityState.substring(0, split).trim());
                c.setState(localityState.substring(split).trim());
            }
        }
        // The post code is looked up as the text between "<address>, <suburb> " and "</p>".
        String addressLocalityState = c.getAddress() + ", " + localityState;
        c.setPostCode(YellowPagesWebScraper.grab(chunk, addressLocalityState.trim() + " ", "</p>"));
        c.setCategory(category);
        String logo = YellowPagesWebScraper.grab(chunk, LOGO_START, LOGO_END);
        c.setLogo(logo != null && logo.contains("/") ? LOGO_URL_PREFIX + logo : "");
        return c;
    }

    /**
     * Downloads a URL after a random 4-8 second pause, renewing the IP and retrying for as
     * long as the response is a block page.
     *
     * @param url absolute URL to fetch
     * @return the response body, or {@code null} if no connection could be obtained after
     *         repeated attempts or the response had no body
     * @throws Exception on I/O failures or if the thread is interrupted while pausing
     */
    public static String download(String url) throws Exception {
        // Loop instead of recursion so a long run of block pages cannot grow the stack.
        while (true) {
            long pause = 4000L + (long)(Math.random() * 4000.0);
            log.debug("Sleeping for " + pause + " realRequestCount: " + realRequestCount);
            Thread.sleep(pause);
            log.debug("Woke up");
            String ret = YellowPagesWebScraper.executeGet(url);
            if (ret == null) {
                log.error("No response body for {}", url);
                return null;
            }
            log.debug("request {} downloaded {}", realRequestCount, ret.length());
            if (!ret.contains(UNUSUAL)) {
                if (realRequestCount % COOKIE_RESET_INTERVAL == 0) {
                    log.debug("Clearing cookies");
                    client.getState().clearCookies();
                }
                return ret;
            }
            log.debug("Unusual activity detected, renewing IP");
            try {
                client.getState().clearCookies();
                hmaClient.changeIp();
            }
            catch (Exception e) {
                log.error("Exception renewing ip", e);
                if (e instanceof InterruptedException) {
                    // Keep the interrupt visible so the next sleep aborts the retry loop.
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    /**
     * Executes one GET, retrying while the connection pool times out (giving up with null on
     * the attempt numbered 4), and always hands the connection back to the pool.
     */
    private static String executeGet(String url) throws Exception {
        GetMethod method = new GetMethod(url);
        method.setRequestHeader("user-agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
        try {
            int attempt = 0;
            while (true) {
                try {
                    client.executeMethod((HttpMethod)method);
                    break;
                }
                catch (ConnectionPoolTimeoutException e) {
                    log.error("Connection timed out at attempt " + attempt, e);
                    if (attempt > 0) {
                        if (attempt % 2 == 0) {
                            log.debug("Timeout after {} attempts will clear cookies", attempt);
                            client.getState().clearCookies();
                        }
                        if (attempt % 4 == 0) {
                            return null;
                        }
                    }
                    ++attempt;
                }
            }
            // Count only requests that were actually executed; this drives the periodic cookie reset.
            ++realRequestCount;
            return method.getResponseBodyAsString();
        }
        finally {
            method.releaseConnection();
        }
    }

    /**
     * Returns the text between the first occurrence of {@code prefix} and the next
     * occurrence of {@code sufix}, HTML-unescaped twice.
     *
     * @return the extracted text, or {@code null} if {@code prefix} does not occur
     * @throws RuntimeException if {@code prefix} occurs but {@code sufix} does not follow it
     */
    private static String grab(String text, String prefix, String sufix) {
        int prefixIdx = text.indexOf(prefix);
        if (prefixIdx == -1) {
            return null;
        }
        int tokenStart = prefixIdx + prefix.length();
        int tokenEnd = text.indexOf(sufix, tokenStart);
        if (tokenEnd == -1) {
            throw new RuntimeException("Could not find sufix");
        }
        String ret = text.substring(tokenStart, tokenEnd);
        // NOTE(review): replaces a leading "O'" (as the entity O&#039;) with a space before
        // unescaping; kept from the original, purpose not evident from this file.
        ret = ret.replace("O&#039;", " ");
        ret = StringEscapeUtils.unescapeHtml4(ret);
        ret = StringEscapeUtils.unescapeHtml4(ret);
        return ret;
    }

    /**
     * Replaces newlines and tabs with spaces and collapses every run of spaces into one.
     *
     * <p>Called from static initializers of this class, so it must stay free of
     * dependencies on other static fields.
     */
    private static String normalize(String html) {
        html = html.replace("\n", " ");
        html = html.replace("\t", " ");
        while (html.contains("  ")) {
            html = html.replace("  ", " ");
        }
        return html;
    }

    /**
     * Task that fetches and parses a single result page, returning an empty list on any failure.
     * Not instantiated anywhere in this class.
     */
    private static class PageGrab
    implements Callable<List<Company>> {
        private final Category category;
        private final int page;

        @ConstructorProperties(value={"category", "page"})
        public PageGrab(Category category, int page) {
            this.category = category;
            this.page = page;
        }

        @Override
        public List<Company> call() throws Exception {
            try {
                String html = YellowPagesWebScraper.getPage(this.category, this.page);
                return YellowPagesWebScraper.parsePage(html, this.page, this.category);
            }
            catch (Throwable e) {
                // Deliberately broad: a failed page yields no companies instead of failing the task.
                log.error("Could not grab page " + this.page, e);
                return Collections.emptyList();
            }
        }
    }
}

