package spider; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Queue; /** * Downloads web page content starting with a starting url. * If the spider encounters links in the content, it downloads * those as well. * * Steps: * 1. Complete the processPage method. One TestSpider unit tests should pass. * 2. Complete the crawl() method. Both TestSpider unit tests should pass. * * @author shilad * */ public class Spider { /** * Urls waiting to be scraped. The "work" left to do. */ private Queue work = new LinkedList(); /** * Keeps track of counts for each url. */ private AllWordsCounter urlCounter = new AllWordsCounter(); /** * Maximum number of urls that should be scraped. */ private int maxUrls = 100; /** * URLs that have already been retrieved. */ private List finished = new ArrayList(); /** * Helps download and parse the web pages. */ private HttpHelper helper = new HttpHelper(); /** * Creates a new spider that will crawl at most maxUrls. * @param maxUrls Maximum number of URLs to crawl. */ public Spider(int maxUrls) { this.maxUrls = maxUrls; } /** * Crawls at most maxUrls starting with beginningUrl. * @param beginningUrl Starting URL, indicating a web page that potentially contains other URLs. */ public void crawl(String beginningUrl) { work.add(beginningUrl); // TODO: While there is remaining work and we haven't // reach the maximum # of finished urls, process // the next unfinshed url. After processing, mark // it as finished. } /** * Retrieves content from a url and processes that content. * @param url A URL to process. */ public void processPage(String url) { String html = helper.retrieve(url); // TODO: extract all the links from the url // For each link that isn't an image, increment the // count for that link and queue up the link for future scraping. // HINT: Take a look at the helper class } /** * Returns the number of times the spider encountered * links to each url. The url are returned in increasing * frequency order. * * @return Number of URLs encountered. */ public WordCount[] getUrlCounts() { return urlCounter.getCounts(); } /** Getter only to be used for testing. @return The state variable work */ Queue getWork() { return work; } /** Getter only to be used for testing. @return The state variable finished */ List getFinished() { return finished; } }