Given the url of a root page, crawl every page under the wikipedia.org domain. You need to do it with multithreading.
"http://www.wikipedia.org/": ["http://www.wikipedia.org/help/"]
"http://www.wikipedia.org/help/": []
"http://www.wikipedia.org/": ["http://www.wikipedia.org/help/"]
"http://www.wikipedia.org/help/": ["http://www.wikipedia.org/", "http://www.wikipedia.org/about/"]
"http://www.wikipedia.org/about/": ["http://www.google.com/"]
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
public class Solution {
    /**
     * @param url a url of root page
     * @return all urls
     */
    private static final int THREAD_NUM = 12;

    public List<String> crawler(String url) {
        // Shared work queue of pages to parse, plus a concurrent map used
        // as a set of urls that some worker has already claimed.
        BlockingQueue<String> queue = new LinkedBlockingQueue<>();
        ConcurrentHashMap<String, String> visited = new ConcurrentHashMap<>();
        try {
            queue.put(url);
            visited.put(url, "");
            Thread[] threads = new Thread[THREAD_NUM];
            for (int i = 0; i < THREAD_NUM; i++) {
                threads[i] = new Thread(new Crawler(queue, visited));
                threads[i].start();
            }
            // Wait for every worker to decide the queue is drained and exit.
            for (int i = 0; i < THREAD_NUM; i++) {
                threads[i].join();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.out.println("thread interrupt exception thrown");
        }
        return new ArrayList<>(visited.keySet());
    }
}
class Crawler implements Runnable {
    private final BlockingQueue<String> queue;
    private final ConcurrentHashMap<String, String> visited;

    public Crawler(BlockingQueue<String> q, ConcurrentHashMap<String, String> v) {
        queue = q;
        visited = v;
    }

    @Override
    public void run() {
        while (true) {
            try {
                // Don't pair isEmpty() with take(): isEmpty() races with
                // workers that are still producing links, and take() would
                // block forever once the queue drains. Polling with a timeout
                // is a simple (heuristic) way to let every worker exit.
                String cur = queue.poll(1, TimeUnit.SECONDS);
                if (cur == null) {
                    break; // no new work for a second; assume the crawl is done
                }
                for (String nei : HtmlHelper.parseUrls(cur)) {
                    if (visited.containsKey(nei)) {
                        continue;
                    }
                    // Stay inside the wikipedia.org domain.
                    URL netUrl = new URL(nei);
                    String domain = netUrl.getHost();
                    if (!domain.endsWith("wikipedia.org")) {
                        continue;
                    }
                    // putIfAbsent atomically claims the url; marking it
                    // visited before enqueueing it prevents two workers
                    // from queueing the same link at the same time.
                    if (visited.putIfAbsent(nei, "") == null) {
                        queue.put(nei);
                    }
                }
            } catch (Exception e) {
                System.out.println("exception thrown while crawling");
            }
        }
    }
}
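With the stub above in place, a minimal driver might look like this (the Main class is an assumption for local testing; the set of printed urls follows from the domain filter):

import java.util.List;

public class Main {
    public static void main(String[] args) {
        List<String> urls = new Solution().crawler("http://www.wikipedia.org/");
        // On Example 2's graph this should print the three wikipedia.org
        // pages in some order; http://www.google.com/ is filtered out
        // by the endsWith("wikipedia.org") domain check.
        for (String u : urls) {
            System.out.println(u);
        }
    }
}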