Changeset f25a924
- Timestamp:
- 22/04/2012 16:27:16 (10 years ago)
- Branches:
- master
- Children:
- 3bcb813
- Parents:
- a3fa073
- git-author:
- Eric van der Vlist <vdv@dyomedea.com> (22/04/2012 16:27:16)
- git-committer:
- Eric van der Vlist <vdv@dyomedea.com> (22/04/2012 16:27:16)
- Location:
- archiver/pipelines/actions
- Files:
-
- 1 added
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
archiver/pipelines/actions/cxml.xslt
r57daa70 rf25a924 1 1 <?xml version="1.0" encoding="UTF-8"?> 2 <!-- 3 HERITRIX 3 CRAWL JOB CONFIGURATION FILE 4 5 This is a relatively minimal configuration suitable for many crawls. 6 7 Commented-out beans and properties are provided as an example; values 8 shown in comments reflect the actual defaults which are in effect 9 if not otherwise specified. (To change from the default 10 behavior, uncomment AND alter the shown values.) 11 --> 12 <beans xsl:version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 13 xmlns="http://www.springframework.org/schema/beans" 14 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 15 xmlns:context="http://www.springframework.org/schema/context" 16 xmlns:aop="http://www.springframework.org/schema/aop" 17 xmlns:tx="http://www.springframework.org/schema/tx" 18 xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd 19 http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd 20 http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd 21 http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd"> 22 23 <context:annotation-config/> 2 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"> 24 3 25 <!-- 26 OVERRIDES 27 Values elsewhere in the configuration may be replaced ('overridden') 28 by a Properties map declared in a PropertyOverrideConfigurer, 29 using a dotted-bean-path to address individual bean properties. 30 This allows us to collect a few of the most-often changed values 31 in an easy-to-edit format here at the beginning of the model 32 configuration. 
33 --> 34 <!-- overrides from a text property list --> 35 <bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer"> 36 <property name="properties"> 37 <value> 38 # This Properties map is specified in the Java 'property list' text format 39 # http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 4 <xsl:variable name="action" select="/action"/> 40 5 41 metadata.operatorContactUrl=http://owark.org 42 metadata.jobName=basic 43 metadata.description=Basic crawl starting with useful defaults 6 <xsl:template match="/"> 7 <xsl:apply-templates select="doc('crawler-beans-template.cxml')/*"/> 8 </xsl:template> 44 9 45 ##..more?..## 46 </value> 47 </property> 48 </bean> 10 <xsl:template match="@* | node()"> 11 <xsl:copy> 12 <xsl:apply-templates select="@* | node()"/> 13 </xsl:copy> 14 </xsl:template> 49 15 50 <!-- overrides from declared <prop> elements, more easily allowing 51 multiline values or even declared beans --> 52 <bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer"> 53 <property name="properties"> 54 <props> 55 <prop key="seeds.textSource.value"> 16 <xsl:template match="url"> 17 <xsl:value-of select="$action/@url"/> 18 </xsl:template> 56 19 57 <xsl:value-of select="/action/@url"/> 58 59 </prop> 60 </props> 61 </property> 62 </bean> 63 64 <!-- CRAWL METADATA: including identification of crawler/operator --> 65 <bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName"> 66 <property name="operatorContactUrl" value="[see override above]"/> 67 <property name="jobName" value="[see override above]"/> 68 <property name="description" value="[see override above]"/> 69 <!-- <property name="robotsPolicyName" value="obey"/> --> 70 <!-- <property name="operator" value=""/> --> 71 <!-- <property name="operatorFrom" value=""/> --> 72 <!-- <property name="organization" value=""/> --> 73 <!-- <property name="audience" value=""/> --> 
74 <!-- <property name="userAgentTemplate" 75 value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> --> 76 77 </bean> 78 79 <!-- SEEDS: crawl starting points 80 ConfigString allows simple, inline specification of a moderate 81 number of seeds; see below comment for example of using an 82 arbitrarily-large external file. --> 83 <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> 84 <property name="textSource"> 85 <bean class="org.archive.spring.ConfigString"> 86 <property name="value"> 87 <value> 88 # [see override above] 89 </value> 90 </property> 91 </bean> 92 </property> 93 <!-- <property name='sourceTagSeeds' value='false'/> --> 94 <!-- <property name='blockAwaitingSeedLines' value='-1'/> --> 95 </bean> 96 97 <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in 98 the job directory, similar to the H1 approach. 99 Use either the above, or this, but not both. --> 100 <!-- 101 <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> 102 <property name="textSource"> 103 <bean class="org.archive.spring.ConfigFile"> 104 <property name="path" value="seeds.txt" /> 105 </bean> 106 </property> 107 <property name='sourceTagSeeds' value='false'/> 108 <property name='blockAwaitingSeedLines' value='-1'/> 109 </bean> 110 --> 111 112 <!-- SCOPE: rules for which discovered URIs to crawl; order is very 113 important because last decision returned other than 'NONE' wins. --> 114 <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence"> 115 <!-- <property name="logToFile" value="false" /> --> 116 <property name="rules"> 117 <list> 118 <!-- Begin by REJECTing all... --> 119 <bean class="org.archive.modules.deciderules.RejectDecideRule"> 120 </bean> 121 <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... 
--> 122 <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> 123 <!-- <property name="seedsAsSurtPrefixes" value="true" /> --> 124 <!-- <property name="alsoCheckVia" value="false" /> --> 125 <!-- <property name="surtsSourceFile" value="" /> --> 126 <!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> --> 127 <!-- <property name="surtsSource"> 128 <bean class="org.archive.spring.ConfigString"> 129 <property name="value"> 130 <value> 131 # example.com 132 # http://www.example.edu/path1/ 133 # +http://(org,example, 134 </value> 135 </property> 136 </bean> 137 </property> --> 138 </bean> 139 <!-- ...but REJECT those more than a configured link-hop-count from start... --> 140 <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule"> 141 <property name="maxHops" value="0" /> 142 </bean> 143 <!-- ...but ACCEPT those more than a configured link-hop-count from start... --> 144 <bean class="org.archive.modules.deciderules.TransclusionDecideRule"> 145 <!-- <property name="maxTransHops" value="2" /> --> 146 <!-- <property name="maxSpeculativeHops" value="1" /> --> 147 </bean> 148 <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... --> 149 <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> 150 <property name="decision" value="REJECT"/> 151 <property name="seedsAsSurtPrefixes" value="false"/> 152 <property name="surtsDumpFile" value="${launchId}/negative-surts.dump" /> 153 <!-- <property name="surtsSource"> 154 <bean class="org.archive.spring.ConfigFile"> 155 <property name="path" value="negative-surts.txt" /> 156 </bean> 157 </property> --> 158 </bean> 159 <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... 
--> 160 <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 161 <property name="decision" value="REJECT"/> 162 <!-- <property name="listLogicalOr" value="true" /> --> 163 <!-- <property name="regexList"> 164 <list> 165 </list> 166 </property> --> 167 </bean> 168 <!-- ...and REJECT those with suspicious repeating path-segments... --> 169 <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule"> 170 <!-- <property name="maxRepetitions" value="2" /> --> 171 </bean> 172 <!-- ...and REJECT those with more than a threshold number of path-segments... --> 173 <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule"> 174 <!-- <property name="maxPathDepth" value="20" /> --> 175 </bean> 176 <!-- ...but always ACCEPT those marked as prerequisite for another URI... --> 177 <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule"> 178 </bean> 179 <!-- ...but always REJECT those with unsupported URI schemes --> 180 <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule"> 181 </bean> 182 </list> 183 </property> 184 </bean> 185 186 <!-- 187 PROCESSING CHAINS 188 Much of the crawler's work is specified by the sequential 189 application of swappable Processor modules. These Processors 190 are collected into three 'chains'. The CandidateChain is applied 191 to URIs being considered for inclusion, before a URI is enqueued 192 for collection. The FetchChain is applied to URIs when their 193 turn for collection comes up. The DispositionChain is applied 194 after a URI is fetched and analyzed/link-extracted. 
195 --> 196 197 <!-- CANDIDATE CHAIN --> 198 <!-- first, processors are declared as top-level named beans --> 199 <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper"> 200 </bean> 201 <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer"> 202 <!-- <property name="preferenceDepthHops" value="-1" /> --> 203 <!-- <property name="preferenceEmbedHops" value="1" /> --> 204 <!-- <property name="canonicalizationPolicy"> 205 <ref bean="canonicalizationPolicy" /> 206 </property> --> 207 <!-- <property name="queueAssignmentPolicy"> 208 <ref bean="queueAssignmentPolicy" /> 209 </property> --> 210 <!-- <property name="uriPrecedencePolicy"> 211 <ref bean="uriPrecedencePolicy" /> 212 </property> --> 213 <!-- <property name="costAssignmentPolicy"> 214 <ref bean="costAssignmentPolicy" /> 215 </property> --> 216 </bean> 217 <!-- now, processors are assembled into ordered CandidateChain bean --> 218 <bean id="candidateProcessors" class="org.archive.modules.CandidateChain"> 219 <property name="processors"> 220 <list> 221 <!-- apply scoping rules to each individual candidate URI... --> 222 <ref bean="candidateScoper"/> 223 <!-- ...then prepare those ACCEPTed to be enqueued to frontier. 
--> 224 <ref bean="preparer"/> 225 </list> 226 </property> 227 </bean> 228 229 <!-- FETCH CHAIN --> 230 <!-- first, processors are declared as top-level named beans --> 231 <bean id="preselector" class="org.archive.crawler.prefetch.Preselector"> 232 <!-- <property name="recheckScope" value="false" /> --> 233 <!-- <property name="blockAll" value="false" /> --> 234 <!-- <property name="blockByRegex" value="" /> --> 235 <!-- <property name="allowByRegex" value="" /> --> 236 </bean> 237 <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer"> 238 <!-- <property name="ipValidityDurationSeconds" value="21600" /> --> 239 <!-- <property name="robotsValidityDurationSeconds" value="86400" /> --> 240 <!-- <property name="calculateRobotsOnly" value="false" /> --> 241 </bean> 242 <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS"> 243 <!-- <property name="acceptNonDnsResolves" value="false" /> --> 244 <!-- <property name="digestContent" value="true" /> --> 245 <!-- <property name="digestAlgorithm" value="sha1" /> --> 246 </bean> 247 <!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois"> 248 <property name="specialQueryTemplates"> 249 <map> 250 <entry key="whois.verisign-grs.com" value="domain %s" /> 251 <entry key="whois.arin.net" value="z + %s" /> 252 <entry key="whois.denic.de" value="-T dn %s" /> 253 </map> 254 </property> 255 </bean> --> 256 <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP"> 257 <!-- <property name="useHTTP11" value="false" /> --> 258 <!-- <property name="maxLengthBytes" value="0" /> --> 259 <!-- <property name="timeoutSeconds" value="1200" /> --> 260 <!-- <property name="maxFetchKBSec" value="0" /> --> 261 <!-- <property name="defaultEncoding" value="ISO-8859-1" /> --> 262 <!-- <property name="shouldFetchBodyRule"> 263 <bean class="org.archive.modules.deciderules.AcceptDecideRule"/> 264 </property> --> 265 <!-- <property name="soTimeoutMs" value="20000" /> --> 266 <!-- <property 
name="sendIfModifiedSince" value="true" /> --> 267 <!-- <property name="sendIfNoneMatch" value="true" /> --> 268 <!-- <property name="sendConnectionClose" value="true" /> --> 269 <!-- <property name="sendReferer" value="true" /> --> 270 <!-- <property name="sendRange" value="false" /> --> 271 <!-- <property name="ignoreCookies" value="false" /> --> 272 <!-- <property name="sslTrustLevel" value="OPEN" /> --> 273 <!-- <property name="acceptHeaders"> 274 <list> 275 <value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value> 276 </list> 277 </property> 278 --> 279 <!-- <property name="httpBindAddress" value="" /> --> 280 <!-- <property name="httpProxyHost" value="" /> --> 281 <!-- <property name="httpProxyPort" value="0" /> --> 282 <!-- <property name="httpProxyUser" value="" /> --> 283 <!-- <property name="httpProxyPassword" value="" /> --> 284 <!-- <property name="digestContent" value="true" /> --> 285 <!-- <property name="digestAlgorithm" value="sha1" /> --> 286 </bean> 287 <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP"> 288 </bean> 289 <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML"> 290 <!-- <property name="extractJavascript" value="true" /> --> 291 <!-- <property name="extractValueAttributes" value="true" /> --> 292 <!-- <property name="ignoreFormActionUrls" value="false" /> --> 293 <!-- <property name="extractOnlyFormGets" value="true" /> --> 294 <!-- <property name="treatFramesAsEmbedLinks" value="true" /> --> 295 <!-- <property name="ignoreUnexpectedHtml" value="true" /> --> 296 <!-- <property name="maxElementLength" value="1024" /> --> 297 <!-- <property name="maxAttributeNameLength" value="1024" /> --> 298 <!-- <property name="maxAttributeValueLength" value="16384" /> --> 299 </bean> 300 <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS"> 301 </bean> 302 <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS"> 303 </bean> 304 <bean 
id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF"> 305 </bean> 306 <!-- now, processors are assembled into ordered FetchChain bean --> 307 <bean id="fetchProcessors" class="org.archive.modules.FetchChain"> 308 <property name="processors"> 309 <list> 310 <!-- re-check scope, if so enabled... --> 311 <ref bean="preselector"/> 312 <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... --> 313 <ref bean="preconditions"/> 314 <!-- ...fetch if DNS URI... --> 315 <ref bean="fetchDns"/> 316 <!-- <ref bean="fetchWhois"/> --> 317 <!-- ...fetch if HTTP URI... --> 318 <ref bean="fetchHttp"/> 319 <!-- ...extract outlinks from HTTP headers... --> 320 <ref bean="extractorHttp"/> 321 <!-- ...extract outlinks from HTML content... --> 322 <ref bean="extractorHtml"/> 323 <!-- ...extract outlinks from CSS content... --> 324 <ref bean="extractorCss"/> 325 <!-- ...extract outlinks from Javascript content... --> 326 <ref bean="extractorJs"/> 327 <!-- ...extract outlinks from Flash content... 
--> 328 <ref bean="extractorSwf"/> 329 </list> 330 </property> 331 </bean> 332 333 <!-- DISPOSITION CHAIN --> 334 <!-- first, processors are declared as top-level named beans --> 335 <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor"> 336 <!-- <property name="compress" value="true" /> --> 337 <!-- <property name="prefix" value="IAH" /> --> 338 <!-- <property name="suffix" value="${HOSTNAME}" /> --> 339 <!-- <property name="maxFileSizeBytes" value="1000000000" /> --> 340 <!-- <property name="poolMaxActive" value="1" /> --> 341 <!-- <property name="MaxWaitForIdleMs" value="500" /> --> 342 <!-- <property name="skipIdenticalDigests" value="false" /> --> 343 <!-- <property name="maxTotalBytesToWrite" value="0" /> --> 344 <!-- <property name="directory" value="${launchId}" /> --> 345 <!-- <property name="storePaths"> 346 <list> 347 <value>warcs</value> 348 </list> 349 </property> --> 350 <!-- <property name="writeRequests" value="true" /> --> 351 <!-- <property name="writeMetadata" value="true" /> --> 352 <!-- <property name="writeRevisitForIdenticalDigests" value="true" /> --> 353 <!-- <property name="writeRevisitForNotModified" value="true" /> --> 354 </bean> 355 <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor"> 356 <!-- <property name="seedsRedirectNewSeeds" value="true" /> --> 357 </bean> 358 <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor"> 359 <!-- <property name="delayFactor" value="5.0" /> --> 360 <!-- <property name="minDelayMs" value="3000" /> --> 361 <!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> --> 362 <!-- <property name="maxDelayMs" value="30000" /> --> 363 <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> --> 364 </bean> 365 <!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor"> 366 <property name="rescheduleDelaySeconds" value="-1" /> 367 </bean> --> 368 <!-- now, processors are 
assembled into ordered DispositionChain bean --> 369 <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain"> 370 <property name="processors"> 371 <list> 372 <!-- write to aggregate archival files... --> 373 <ref bean="warcWriter"/> 374 <!-- ...send each outlink candidate URI to CandidateChain, 375 and enqueue those ACCEPTed to the frontier... --> 376 <ref bean="candidates"/> 377 <!-- ...then update stats, shared-structures, frontier decisions --> 378 <ref bean="disposition"/> 379 <!-- <ref bean="rescheduler" /> --> 380 </list> 381 </property> 382 </bean> 383 384 <!-- CRAWLCONTROLLER: Control interface, unifying context --> 385 <bean id="crawlController" 386 class="org.archive.crawler.framework.CrawlController"> 387 <!-- <property name="maxToeThreads" value="25" /> --> 388 <!-- <property name="pauseAtStart" value="true" /> --> 389 <!-- <property name="runWhileEmpty" value="false" /> --> 390 <!-- <property name="recorderInBufferBytes" value="524288" /> --> 391 <!-- <property name="recorderOutBufferBytes" value="16384" /> --> 392 <!-- <property name="scratchDir" value="scratch" /> --> 393 </bean> 394 395 <!-- FRONTIER: Record of all URIs discovered and queued-for-collection --> 396 <bean id="frontier" 397 class="org.archive.crawler.frontier.BdbFrontier"> 398 <!-- <property name="queueTotalBudget" value="-1" /> --> 399 <!-- <property name="balanceReplenishAmount" value="3000" /> --> 400 <!-- <property name="errorPenaltyAmount" value="100" /> --> 401 <!-- <property name="precedenceFloor" value="255" /> --> 402 <!-- <property name="queuePrecedencePolicy"> 403 <bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" /> 404 </property> --> 405 <!-- <property name="snoozeLongMs" value="300000" /> --> 406 <!-- <property name="retryDelaySeconds" value="900" /> --> 407 <!-- <property name="maxRetries" value="30" /> --> 408 <!-- <property name="recoveryLogEnabled" value="true" /> --> 409 <!-- <property name="maxOutlinks" 
value="6000" /> --> 410 <!-- <property name="extractIndependently" value="false" /> --> 411 <!-- <property name="outbound"> 412 <bean class="java.util.concurrent.ArrayBlockingQueue"> 413 <constructor-arg value="200"/> 414 <constructor-arg value="true"/> 415 </bean> 416 </property> --> 417 <!-- <property name="inbound"> 418 <bean class="java.util.concurrent.ArrayBlockingQueue"> 419 <constructor-arg value="40000"/> 420 <constructor-arg value="true"/> 421 </bean> 422 </property> --> 423 <!-- <property name="dumpPendingAtClose" value="false" /> --> 424 </bean> 425 426 <!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs --> 427 <bean id="uriUniqFilter" 428 class="org.archive.crawler.util.BdbUriUniqFilter"> 429 </bean> 430 431 <!-- 432 EXAMPLE SETTINGS OVERLAY SHEETS 433 Sheets allow some settings to vary by context - usually by URI context, 434 so that different sites or sections of sites can be treated differently. 435 Here are some example Sheets for common purposes. The SheetOverlaysManager 436 (below) automatically collects all Sheet instances declared among the 437 original beans, but others can be added during the crawl via the scripting 438 interface. 439 --> 440 441 <!-- forceRetire: any URI to which this sheet's settings are applied 442 will force its containing queue to 'retired' status. 
--> 443 <bean id='forceRetire' class='org.archive.spring.Sheet'> 444 <property name='map'> 445 <map> 446 <entry key='disposition.forceRetire' value='true'/> 447 </map> 448 </property> 449 </bean> 450 451 <!-- smallBudget: any URI to which this sheet's settings are applied 452 will give its containing queue small values for balanceReplenishAmount 453 (causing it to have shorter 'active' periods while other queues are 454 waiting) and queueTotalBudget (causing the queue to enter 'retired' 455 status once that expenditure is reached by URI attempts and errors) --> 456 <bean id='smallBudget' class='org.archive.spring.Sheet'> 457 <property name='map'> 458 <map> 459 <entry key='frontier.balanceReplenishAmount' value='20'/> 460 <entry key='frontier.queueTotalBudget' value='100'/> 461 </map> 462 </property> 463 </bean> 464 465 <!-- veryPolite: any URI to which this sheet's settings are applied 466 will cause its queue to take extra-long politeness snoozes --> 467 <bean id='veryPolite' class='org.archive.spring.Sheet'> 468 <property name='map'> 469 <map> 470 <entry key='disposition.delayFactor' value='10'/> 471 <entry key='disposition.minDelayMs' value='10000'/> 472 <entry key='disposition.maxDelayMs' value='1000000'/> 473 <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/> 474 </map> 475 </property> 476 </bean> 477 478 <!-- highPrecedence: any URI to which this sheet's settings are applied 479 will give its containing queue a slightly-higher than default 480 queue precedence value. That queue will then be preferred over 481 other queues for active crawling, never waiting behind lower- 482 precedence queues. 
--> 483 <bean id='highPrecedence' class='org.archive.spring.Sheet'> 484 <property name='map'> 485 <map> 486 <entry key='frontier.balanceReplenishAmount' value='20'/> 487 <entry key='frontier.queueTotalBudget' value='100'/> 488 </map> 489 </property> 490 </bean> 491 492 <!-- 493 EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION 494 A SheetAssociation says certain URIs should have certain overlay Sheets 495 applied. This example applies two sheets to URIs matching two SURT-prefixes. 496 New associations may also be added mid-crawl using the scripting facility. 497 --> 498 499 <!-- 500 <bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> 501 <property name='surtPrefixes'> 502 <list> 503 <value>http://(org,example,</value> 504 <value>http://(com,example,www,)/</value> 505 </list> 506 </property> 507 <property name='targetSheetNames'> 508 <list> 509 <value>veryPolite</value> 510 <value>smallBudget</value> 511 </list> 512 </property> 513 </bean> 514 --> 515 516 <!-- 517 OPTIONAL BUT RECOMMENDED BEANS 518 --> 519 520 <!-- ACTIONDIRECTORY: disk directory for mid-crawl operations 521 Running job will watch directory for new files with URIs, 522 scripts, and other data to be processed during a crawl. 
--> 523 <bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory"> 524 <!-- <property name="actionDir" value="action" /> --> 525 <!-- <property name="doneDir" value="${launchId}/actions-done" /> --> 526 <!-- <property name="initialDelaySeconds" value="10" /> --> 527 <!-- <property name="delaySeconds" value="30" /> --> 528 </bean> 529 530 <!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits --> 531 <bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer"> 532 <!-- <property name="maxBytesDownload" value="0" /> --> 533 <!-- <property name="maxDocumentsDownload" value="0" /> --> 534 <!-- <property name="maxTimeSeconds" value="0" /> --> 535 </bean> 536 537 <!-- CHECKPOINTSERVICE: checkpointing assistance --> 538 <bean id="checkpointService" 539 class="org.archive.crawler.framework.CheckpointService"> 540 <!-- <property name="checkpointIntervalMinutes" value="-1"/> --> 541 <!-- <property name="checkpointsDir" value="checkpoints"/> --> 542 </bean> 543 544 <!-- 545 OPTIONAL BEANS 546 Uncomment and expand as needed, or if non-default alternate 547 implementations are preferred. 
548 --> 549 550 <!-- CANONICALIZATION POLICY --> 551 <!-- 552 <bean id="canonicalizationPolicy" 553 class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy"> 554 <property name="rules"> 555 <list> 556 <bean class="org.archive.modules.canonicalize.LowercaseRule" /> 557 <bean class="org.archive.modules.canonicalize.StripUserinfoRule" /> 558 <bean class="org.archive.modules.canonicalize.StripWWWNRule" /> 559 <bean class="org.archive.modules.canonicalize.StripSessionIDs" /> 560 <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" /> 561 <bean class="org.archive.modules.canonicalize.FixupQueryString" /> 562 </list> 563 </property> 564 </bean> 565 --> 566 567 568 <!-- QUEUE ASSIGNMENT POLICY --> 569 <!-- 570 <bean id="queueAssignmentPolicy" 571 class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy"> 572 <property name="forceQueueAssignment" value="" /> 573 <property name="deferToPrevious" value="true" /> 574 <property name="parallelQueues" value="1" /> 575 </bean> 576 --> 577 578 <!-- URI PRECEDENCE POLICY --> 579 <!-- 580 <bean id="uriPrecedencePolicy" 581 class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy"> 582 </bean> 583 --> 584 585 <!-- COST ASSIGNMENT POLICY --> 586 <!-- 587 <bean id="costAssignmentPolicy" 588 class="org.archive.crawler.frontier.UnitCostAssignmentPolicy"> 589 </bean> 590 --> 591 592 <!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials --> 593 <!-- 594 <bean id="credentialStore" 595 class="org.archive.modules.credential.CredentialStore"> 596 </bean> 597 --> 598 599 <!-- DISK SPACE MONITOR: 600 Pauses the crawl if disk space at monitored paths falls below minimum threshold --> 601 <!-- 602 <bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor"> 603 <property name="pauseThresholdMiB" value="500" /> 604 <property name="monitorConfigPaths" value="true" /> 605 <property name="monitorPaths"> 606 <list> 607 <value>PATH</value> 608 </list> 609 </property> 
610 </bean> 611 --> 612 613 <!-- 614 REQUIRED STANDARD BEANS 615 It will be very rare to replace or reconfigure the following beans. 616 --> 617 618 <!-- STATISTICSTRACKER: standard stats/reporting collector --> 619 <bean id="statisticsTracker" 620 class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName"> 621 <!-- <property name="reports"> 622 <list> 623 <bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" /> 624 <bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" /> 625 <bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" /> 626 <bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" /> 627 <bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" /> 628 <bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" /> 629 <bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" /> 630 <bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" /> 631 <bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" /> 632 <bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" /> 633 </list> 634 </property> --> 635 <!-- <property name="reportsDir" value="${launchId}/reports" /> --> 636 <!-- <property name="liveHostReportSize" value="20" /> --> 637 <!-- <property name="intervalSeconds" value="20" /> --> 638 <!-- <property name="keepSnapshotsCount" value="5" /> --> 639 <!-- <property name="liveHostReportSize" value="20" /> --> 640 </bean> 641 642 <!-- CRAWLERLOGGERMODULE: shared logging facility --> 643 <bean id="loggerModule" 644 class="org.archive.crawler.reporting.CrawlerLoggerModule"> 645 <!-- <property name="path" value="${launchId}/logs" /> --> 646 <!-- <property name="crawlLogPath" value="crawl.log" /> --> 647 <!-- <property name="alertsLogPath" value="alerts.log" /> --> 648 <!-- 
<property name="progressLogPath" value="progress-statistics.log" /> --> 649 <!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> --> 650 <!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> --> 651 <!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> --> 652 <!-- <property name="logExtraInfo" value="false" /> --> 653 </bean> 654 655 <!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays 656 Autowired to include any SheetForSurtPrefix or 657 SheetForDecideRuled beans --> 658 <bean id="sheetOverlaysManager" autowire="byType" 659 class="org.archive.crawler.spring.SheetOverlaysManager"> 660 </bean> 661 662 <!-- BDBMODULE: shared BDB-JE disk persistence manager --> 663 <bean id="bdb" 664 class="org.archive.bdb.BdbModule"> 665 <!-- <property name="dir" value="state" /> --> 666 <!-- <property name="cachePercent" value="60" /> --> 667 <!-- <property name="useSharedCache" value="true" /> --> 668 <!-- <property name="expectedConcurrency" value="25" /> --> 669 </bean> 670 671 <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP --> 672 <bean id="cookieStorage" 673 class="org.archive.modules.fetcher.BdbCookieStorage"> 674 <!-- <property name="cookiesLoadFile"><null/></property> --> 675 <!-- <property name="cookiesSaveFile"><null/></property> --> 676 <!-- <property name="bdb"> 677 <ref bean="bdb"/> 678 </property> --> 679 </bean> 680 681 <!-- SERVERCACHE: shared cache of server/host info --> 682 <bean id="serverCache" 683 class="org.archive.modules.net.BdbServerCache"> 684 <!-- <property name="bdb"> 685 <ref bean="bdb"/> 686 </property> --> 687 </bean> 688 689 <!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative 690 to crawler-beans.cxml file, and tracking crawl files for web UI --> 691 <bean id="configPathConfigurer" 692 class="org.archive.spring.ConfigPathConfigurer"> 693 </bean> 694 695 </beans> 20 </xsl:stylesheet>
Note: See TracChangeset
for help on using the changeset viewer.