Changeset f25a924


Ignore:
Timestamp:
22/04/2012 16:27:16 (10 years ago)
Author:
Eric van der Vlist <vdv@dyomedea.com>
Branches:
master
Children:
3bcb813
Parents:
a3fa073
git-author:
Eric van der Vlist <vdv@dyomedea.com> (22/04/2012 16:27:16)
git-committer:
Eric van der Vlist <vdv@dyomedea.com> (22/04/2012 16:27:16)
Message:

Modifying the way the Heritrix (Spring) config file is generated since it seems to be picky about whitespace and indentation...

Location:
archiver/pipelines/actions
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • archiver/pipelines/actions/cxml.xslt

    r57daa70 rf25a924  
    11<?xml version="1.0" encoding="UTF-8"?> 
    2 <!--  
    3   HERITRIX 3 CRAWL JOB CONFIGURATION FILE 
    4    
    5    This is a relatively minimal configuration suitable for many crawls. 
    6     
    7    Commented-out beans and properties are provided as an example; values 
    8    shown in comments reflect the actual defaults which are in effect 
    9    if not otherwise specified. (To change from the default  
    10    behavior, uncomment AND alter the shown values.)    
    11  --> 
    12 <beans xsl:version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
    13  xmlns="http://www.springframework.org/schema/beans" 
    14              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
    15          xmlns:context="http://www.springframework.org/schema/context" 
    16              xmlns:aop="http://www.springframework.org/schema/aop" 
    17              xmlns:tx="http://www.springframework.org/schema/tx" 
    18              xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd 
    19            http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd 
    20            http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd 
    21            http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd"> 
    22   
    23  <context:annotation-config/> 
     2<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="2.0"> 
    243 
    25 <!--  
    26   OVERRIDES 
    27    Values elsewhere in the configuration may be replaced ('overridden')  
    28    by a Properties map declared in a PropertiesOverrideConfigurer,  
    29    using a dotted-bean-path to address individual bean properties.  
    30    This allows us to collect a few of the most-often changed values 
    31    in an easy-to-edit format here at the beginning of the model 
    32    configuration.     
    33  --> 
    34  <!-- overrides from a text property list --> 
    35  <bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer"> 
    36   <property name="properties"> 
    37    <value> 
    38 # This Properties map is specified in the Java 'property list' text format 
    39 # http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 
     4 <xsl:variable name="action" select="/action"/> 
    405 
    41 metadata.operatorContactUrl=http://owark.org 
    42 metadata.jobName=basic 
    43 metadata.description=Basic crawl starting with useful defaults 
     6 <xsl:template match="/"> 
     7  <xsl:apply-templates select="doc('crawler-beans-template.cxml')/*"/> 
     8 </xsl:template> 
    449 
    45 ##..more?..## 
    46    </value> 
    47   </property> 
    48  </bean> 
     10 <xsl:template match="@* | node()"> 
     11  <xsl:copy> 
     12   <xsl:apply-templates select="@* | node()"/> 
     13  </xsl:copy> 
     14 </xsl:template> 
    4915 
    50  <!-- overrides from declared <prop> elements, more easily allowing 
    51       multiline values or even declared beans --> 
    52  <bean id="longerOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer"> 
    53   <property name="properties"> 
    54    <props> 
    55     <prop key="seeds.textSource.value"> 
     16 <xsl:template match="url"> 
     17  <xsl:value-of select="$action/@url"/> 
     18 </xsl:template> 
    5619 
    57     <xsl:value-of select="/action/@url"/> 
    58  
    59     </prop> 
    60    </props> 
    61   </property> 
    62  </bean> 
    63  
    64  <!-- CRAWL METADATA: including identification of crawler/operator --> 
    65  <bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName"> 
    66        <property name="operatorContactUrl" value="[see override above]"/> 
    67        <property name="jobName" value="[see override above]"/> 
    68        <property name="description" value="[see override above]"/> 
    69   <!-- <property name="robotsPolicyName" value="obey"/> --> 
    70   <!-- <property name="operator" value=""/> --> 
    71   <!-- <property name="operatorFrom" value=""/> --> 
    72   <!-- <property name="organization" value=""/> --> 
    73   <!-- <property name="audience" value=""/> --> 
    74   <!-- <property name="userAgentTemplate"  
    75          value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> --> 
    76         
    77  </bean> 
    78   
    79  <!-- SEEDS: crawl starting points  
    80       ConfigString allows simple, inline specification of a moderate 
    81       number of seeds; see below comment for example of using an 
    82       arbitrarily-large external file. --> 
    83  <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> 
    84      <property name="textSource"> 
    85       <bean class="org.archive.spring.ConfigString"> 
    86        <property name="value"> 
    87         <value> 
    88 # [see override above] 
    89         </value> 
    90        </property> 
    91       </bean> 
    92      </property> 
    93 <!-- <property name='sourceTagSeeds' value='false'/> --> 
    94 <!-- <property name='blockAwaitingSeedLines' value='-1'/> --> 
    95  </bean> 
    96   
    97  <!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in 
    98       the job directory, similar to the H1 approach.  
    99       Use either the above, or this, but not both. --> 
    100  <!--  
    101  <bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> 
    102   <property name="textSource"> 
    103    <bean class="org.archive.spring.ConfigFile"> 
    104     <property name="path" value="seeds.txt" /> 
    105    </bean> 
    106   </property> 
    107   <property name='sourceTagSeeds' value='false'/> 
    108   <property name='blockAwaitingSeedLines' value='-1'/> 
    109  </bean> 
    110   --> 
    111   
    112  <!-- SCOPE: rules for which discovered URIs to crawl; order is very  
    113       important because last decision returned other than 'NONE' wins. --> 
    114  <bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence"> 
    115   <!-- <property name="logToFile" value="false" /> --> 
    116   <property name="rules"> 
    117    <list> 
    118     <!-- Begin by REJECTing all... --> 
    119     <bean class="org.archive.modules.deciderules.RejectDecideRule"> 
    120     </bean> 
    121     <!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... --> 
    122     <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> 
    123      <!-- <property name="seedsAsSurtPrefixes" value="true" /> --> 
    124      <!-- <property name="alsoCheckVia" value="false" /> --> 
    125      <!-- <property name="surtsSourceFile" value="" /> --> 
    126      <!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> --> 
    127      <!-- <property name="surtsSource"> 
    128            <bean class="org.archive.spring.ConfigString"> 
    129             <property name="value"> 
    130              <value> 
    131               # example.com 
    132               # http://www.example.edu/path1/ 
    133               # +http://(org,example, 
    134              </value> 
    135             </property>  
    136            </bean> 
    137           </property> --> 
    138     </bean> 
    139     <!-- ...but REJECT those more than a configured link-hop-count from start... --> 
    140     <bean class="org.archive.modules.deciderules.TooManyHopsDecideRule"> 
    141       <property name="maxHops" value="0" />  
    142     </bean> 
    143     <!-- ...but ACCEPT transcluded URIs within the configured trans-hop and speculative-hop limits... --> 
    144     <bean class="org.archive.modules.deciderules.TransclusionDecideRule"> 
    145      <!-- <property name="maxTransHops" value="2" /> --> 
    146      <!-- <property name="maxSpeculativeHops" value="1" /> --> 
    147     </bean> 
    148     <!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... --> 
    149     <bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> 
    150           <property name="decision" value="REJECT"/> 
    151           <property name="seedsAsSurtPrefixes" value="false"/> 
    152           <property name="surtsDumpFile" value="${launchId}/negative-surts.dump" />  
    153      <!-- <property name="surtsSource"> 
    154            <bean class="org.archive.spring.ConfigFile"> 
    155             <property name="path" value="negative-surts.txt" /> 
    156            </bean> 
    157           </property> --> 
    158     </bean> 
    159     <!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... --> 
    160     <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 
    161           <property name="decision" value="REJECT"/> 
    162      <!-- <property name="listLogicalOr" value="true" /> --> 
    163      <!-- <property name="regexList"> 
    164            <list> 
    165            </list> 
    166           </property> --> 
    167     </bean> 
    168     <!-- ...and REJECT those with suspicious repeating path-segments... --> 
    169     <bean class="org.archive.modules.deciderules.PathologicalPathDecideRule"> 
    170      <!-- <property name="maxRepetitions" value="2" /> --> 
    171     </bean> 
    172     <!-- ...and REJECT those with more than threshold number of path-segments... --> 
    173     <bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule"> 
    174      <!-- <property name="maxPathDepth" value="20" /> --> 
    175     </bean> 
    176     <!-- ...but always ACCEPT those marked as a prerequisite for another URI... --> 
    177     <bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule"> 
    178     </bean> 
    179     <!-- ...but always REJECT those with unsupported URI schemes --> 
    180     <bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule"> 
    181     </bean> 
    182    </list> 
    183   </property> 
    184  </bean> 
    185   
    186  <!--  
    187    PROCESSING CHAINS 
    188     Much of the crawler's work is specified by the sequential  
    189     application of swappable Processor modules. These Processors 
    190     are collected into three 'chains'. The CandidateChain is applied  
    191     to URIs being considered for inclusion, before a URI is enqueued 
    192     for collection. The FetchChain is applied to URIs when their  
    193     turn for collection comes up. The DispositionChain is applied  
    194     after a URI is fetched and analyzed/link-extracted. 
    195   --> 
    196    
    197  <!-- CANDIDATE CHAIN -->  
    198  <!-- first, processors are declared as top-level named beans --> 
    199  <bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper"> 
    200  </bean> 
    201  <bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer"> 
    202   <!-- <property name="preferenceDepthHops" value="-1" /> --> 
    203   <!-- <property name="preferenceEmbedHops" value="1" /> --> 
    204   <!-- <property name="canonicalizationPolicy">  
    205         <ref bean="canonicalizationPolicy" /> 
    206        </property> --> 
    207   <!-- <property name="queueAssignmentPolicy">  
    208         <ref bean="queueAssignmentPolicy" /> 
    209        </property> --> 
    210   <!-- <property name="uriPrecedencePolicy">  
    211         <ref bean="uriPrecedencePolicy" /> 
    212        </property> --> 
    213   <!-- <property name="costAssignmentPolicy">  
    214         <ref bean="costAssignmentPolicy" /> 
    215        </property> --> 
    216  </bean> 
    217  <!-- now, processors are assembled into ordered CandidateChain bean --> 
    218  <bean id="candidateProcessors" class="org.archive.modules.CandidateChain"> 
    219   <property name="processors"> 
    220    <list> 
    221     <!-- apply scoping rules to each individual candidate URI... --> 
    222     <ref bean="candidateScoper"/> 
    223     <!-- ...then prepare those ACCEPTed to be enqueued to frontier. --> 
    224     <ref bean="preparer"/> 
    225    </list> 
    226   </property> 
    227  </bean> 
    228    
    229  <!-- FETCH CHAIN -->  
    230  <!-- first, processors are declared as top-level named beans --> 
    231  <bean id="preselector" class="org.archive.crawler.prefetch.Preselector"> 
    232   <!-- <property name="recheckScope" value="false" /> --> 
    233   <!-- <property name="blockAll" value="false" /> --> 
    234   <!-- <property name="blockByRegex" value="" /> --> 
    235   <!-- <property name="allowByRegex" value="" /> --> 
    236  </bean> 
    237  <bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer"> 
    238   <!-- <property name="ipValidityDurationSeconds" value="21600" /> --> 
    239   <!-- <property name="robotsValidityDurationSeconds" value="86400" /> --> 
    240   <!-- <property name="calculateRobotsOnly" value="false" /> --> 
    241  </bean> 
    242  <bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS"> 
    243   <!-- <property name="acceptNonDnsResolves" value="false" /> --> 
    244   <!-- <property name="digestContent" value="true" /> --> 
    245   <!-- <property name="digestAlgorithm" value="sha1" /> --> 
    246  </bean> 
    247  <!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois"> 
    248        <property name="specialQueryTemplates"> 
    249         <map> 
    250          <entry key="whois.verisign-grs.com" value="domain %s" /> 
    251          <entry key="whois.arin.net" value="z + %s" /> 
    252          <entry key="whois.denic.de" value="-T dn %s" /> 
    253         </map> 
    254        </property>  
    255       </bean> --> 
    256  <bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP"> 
    257   <!-- <property name="useHTTP11" value="false" /> --> 
    258   <!-- <property name="maxLengthBytes" value="0" /> --> 
    259   <!-- <property name="timeoutSeconds" value="1200" /> --> 
    260   <!-- <property name="maxFetchKBSec" value="0" /> --> 
    261   <!-- <property name="defaultEncoding" value="ISO-8859-1" /> --> 
    262   <!-- <property name="shouldFetchBodyRule">  
    263         <bean class="org.archive.modules.deciderules.AcceptDecideRule"/> 
    264        </property> --> 
    265   <!-- <property name="soTimeoutMs" value="20000" /> --> 
    266   <!-- <property name="sendIfModifiedSince" value="true" /> --> 
    267   <!-- <property name="sendIfNoneMatch" value="true" /> --> 
    268   <!-- <property name="sendConnectionClose" value="true" /> --> 
    269   <!-- <property name="sendReferer" value="true" /> --> 
    270   <!-- <property name="sendRange" value="false" /> --> 
    271   <!-- <property name="ignoreCookies" value="false" /> --> 
    272   <!-- <property name="sslTrustLevel" value="OPEN" /> --> 
    273   <!-- <property name="acceptHeaders">  
    274         <list> 
    275          <value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value> 
    276         </list> 
    277        </property> 
    278   --> 
    279   <!-- <property name="httpBindAddress" value="" /> --> 
    280   <!-- <property name="httpProxyHost" value="" /> --> 
    281   <!-- <property name="httpProxyPort" value="0" /> --> 
    282   <!-- <property name="httpProxyUser" value="" /> --> 
    283   <!-- <property name="httpProxyPassword" value="" /> --> 
    284   <!-- <property name="digestContent" value="true" /> --> 
    285   <!-- <property name="digestAlgorithm" value="sha1" /> --> 
    286  </bean> 
    287  <bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP"> 
    288  </bean> 
    289  <bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML"> 
    290   <!-- <property name="extractJavascript" value="true" /> --> 
    291   <!-- <property name="extractValueAttributes" value="true" /> --> 
    292   <!-- <property name="ignoreFormActionUrls" value="false" /> --> 
    293   <!-- <property name="extractOnlyFormGets" value="true" /> --> 
    294   <!-- <property name="treatFramesAsEmbedLinks" value="true" /> --> 
    295   <!-- <property name="ignoreUnexpectedHtml" value="true" /> --> 
    296   <!-- <property name="maxElementLength" value="1024" /> --> 
    297   <!-- <property name="maxAttributeNameLength" value="1024" /> --> 
    298   <!-- <property name="maxAttributeValueLength" value="16384" /> --> 
    299  </bean> 
    300  <bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS"> 
    301  </bean>  
    302  <bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS"> 
    303  </bean> 
    304  <bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF"> 
    305  </bean>     
    306  <!-- now, processors are assembled into ordered FetchChain bean --> 
    307  <bean id="fetchProcessors" class="org.archive.modules.FetchChain"> 
    308   <property name="processors"> 
    309    <list> 
    310     <!-- re-check scope, if so enabled... --> 
    311     <ref bean="preselector"/> 
    312     <!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... --> 
    313     <ref bean="preconditions"/> 
    314     <!-- ...fetch if DNS URI... --> 
    315     <ref bean="fetchDns"/> 
    316     <!-- <ref bean="fetchWhois"/> --> 
    317     <!-- ...fetch if HTTP URI... --> 
    318     <ref bean="fetchHttp"/> 
    319     <!-- ...extract outlinks from HTTP headers... --> 
    320     <ref bean="extractorHttp"/> 
    321     <!-- ...extract outlinks from HTML content... --> 
    322     <ref bean="extractorHtml"/> 
    323     <!-- ...extract outlinks from CSS content... --> 
    324     <ref bean="extractorCss"/> 
    325     <!-- ...extract outlinks from Javascript content... --> 
    326     <ref bean="extractorJs"/> 
    327     <!-- ...extract outlinks from Flash content... --> 
    328     <ref bean="extractorSwf"/> 
    329    </list> 
    330   </property> 
    331  </bean> 
    332    
    333  <!-- DISPOSITION CHAIN --> 
    334  <!-- first, processors are declared as top-level named beans  --> 
    335  <bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor"> 
    336   <!-- <property name="compress" value="true" /> --> 
    337   <!-- <property name="prefix" value="IAH" /> --> 
    338   <!-- <property name="suffix" value="${HOSTNAME}" /> --> 
    339   <!-- <property name="maxFileSizeBytes" value="1000000000" /> --> 
    340   <!-- <property name="poolMaxActive" value="1" /> --> 
    341   <!-- <property name="MaxWaitForIdleMs" value="500" /> --> 
    342   <!-- <property name="skipIdenticalDigests" value="false" /> --> 
    343   <!-- <property name="maxTotalBytesToWrite" value="0" /> --> 
    344   <!-- <property name="directory" value="${launchId}" /> --> 
    345   <!-- <property name="storePaths"> 
    346         <list> 
    347          <value>warcs</value> 
    348         </list> 
    349        </property> --> 
    350   <!-- <property name="writeRequests" value="true" /> --> 
    351   <!-- <property name="writeMetadata" value="true" /> --> 
    352   <!-- <property name="writeRevisitForIdenticalDigests" value="true" /> --> 
    353   <!-- <property name="writeRevisitForNotModified" value="true" /> --> 
    354  </bean> 
    355  <bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor"> 
    356   <!-- <property name="seedsRedirectNewSeeds" value="true" /> --> 
    357  </bean> 
    358  <bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor"> 
    359   <!-- <property name="delayFactor" value="5.0" /> --> 
    360   <!-- <property name="minDelayMs" value="3000" /> --> 
    361   <!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> --> 
    362   <!-- <property name="maxDelayMs" value="30000" /> --> 
    363   <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> --> 
    364  </bean> 
    365  <!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor"> 
    366        <property name="rescheduleDelaySeconds" value="-1" /> 
    367       </bean> --> 
    368  <!-- now, processors are assembled into ordered DispositionChain bean --> 
    369  <bean id="dispositionProcessors" class="org.archive.modules.DispositionChain"> 
    370   <property name="processors"> 
    371    <list> 
    372     <!-- write to aggregate archival files... --> 
    373     <ref bean="warcWriter"/> 
    374     <!-- ...send each outlink candidate URI to CandidateChain,  
    375          and enqueue those ACCEPTed to the frontier... --> 
    376     <ref bean="candidates"/> 
    377     <!-- ...then update stats, shared-structures, frontier decisions --> 
    378     <ref bean="disposition"/> 
    379     <!-- <ref bean="rescheduler" /> --> 
    380    </list> 
    381   </property> 
    382  </bean> 
    383   
    384  <!-- CRAWLCONTROLLER: Control interface, unifying context --> 
    385  <bean id="crawlController"  
    386    class="org.archive.crawler.framework.CrawlController"> 
    387   <!-- <property name="maxToeThreads" value="25" /> --> 
    388   <!-- <property name="pauseAtStart" value="true" /> --> 
    389   <!-- <property name="runWhileEmpty" value="false" /> --> 
    390   <!-- <property name="recorderInBufferBytes" value="524288" /> --> 
    391   <!-- <property name="recorderOutBufferBytes" value="16384" /> --> 
    392   <!-- <property name="scratchDir" value="scratch" /> --> 
    393  </bean> 
    394   
    395  <!-- FRONTIER: Record of all URIs discovered and queued-for-collection --> 
    396  <bean id="frontier"  
    397    class="org.archive.crawler.frontier.BdbFrontier"> 
    398   <!-- <property name="queueTotalBudget" value="-1" /> --> 
    399   <!-- <property name="balanceReplenishAmount" value="3000" /> --> 
    400   <!-- <property name="errorPenaltyAmount" value="100" /> --> 
    401   <!-- <property name="precedenceFloor" value="255" /> --> 
    402   <!-- <property name="queuePrecedencePolicy"> 
    403         <bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" /> 
    404        </property> --> 
    405   <!-- <property name="snoozeLongMs" value="300000" /> --> 
    406   <!-- <property name="retryDelaySeconds" value="900" /> --> 
    407   <!-- <property name="maxRetries" value="30" /> --> 
    408   <!-- <property name="recoveryLogEnabled" value="true" /> --> 
    409   <!-- <property name="maxOutlinks" value="6000" /> --> 
    410   <!-- <property name="extractIndependently" value="false" /> --> 
    411   <!-- <property name="outbound"> 
    412         <bean class="java.util.concurrent.ArrayBlockingQueue"> 
    413          <constructor-arg value="200"/> 
    414          <constructor-arg value="true"/> 
    415         </bean> 
    416        </property> --> 
    417   <!-- <property name="inbound"> 
    418         <bean class="java.util.concurrent.ArrayBlockingQueue"> 
    419          <constructor-arg value="40000"/> 
    420          <constructor-arg value="true"/> 
    421         </bean> 
    422        </property> --> 
    423   <!-- <property name="dumpPendingAtClose" value="false" /> --> 
    424  </bean> 
    425   
    426  <!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs -->  
    427  <bean id="uriUniqFilter"  
    428    class="org.archive.crawler.util.BdbUriUniqFilter"> 
    429  </bean> 
    430   
    431  <!-- 
    432    EXAMPLE SETTINGS OVERLAY SHEETS 
    433    Sheets allow some settings to vary by context - usually by URI context, 
    434    so that different sites or sections of sites can be treated differently.  
    435    Here are some example Sheets for common purposes. The SheetOverlaysManager 
    436    (below) automatically collects all Sheet instances declared among the  
    437    original beans, but others can be added during the crawl via the scripting  
    438    interface. 
    439   --> 
    440  
    441 <!-- forceRetire: any URI to which this sheet's settings are applied  
    442      will force its containing queue to 'retired' status. --> 
    443 <bean id='forceRetire' class='org.archive.spring.Sheet'> 
    444  <property name='map'> 
    445   <map> 
    446    <entry key='disposition.forceRetire' value='true'/> 
    447   </map> 
    448  </property> 
    449 </bean> 
    450  
    451 <!-- smallBudget: any URI to which this sheet's settings are applied  
    452      will give its containing queue small values for balanceReplenishAmount  
    453      (causing it to have shorter 'active' periods while other queues are  
    454      waiting) and queueTotalBudget (causing the queue to enter 'retired'  
    455      status once that expenditure is reached by URI attempts and errors) --> 
    456 <bean id='smallBudget' class='org.archive.spring.Sheet'> 
    457  <property name='map'> 
    458   <map> 
    459    <entry key='frontier.balanceReplenishAmount' value='20'/> 
    460    <entry key='frontier.queueTotalBudget' value='100'/> 
    461   </map> 
    462  </property> 
    463 </bean> 
    464  
    465 <!-- veryPolite: any URI to which this sheet's settings are applied  
    466      will cause its queue to take extra-long politeness snoozes --> 
    467 <bean id='veryPolite' class='org.archive.spring.Sheet'> 
    468  <property name='map'> 
    469   <map> 
    470    <entry key='disposition.delayFactor' value='10'/> 
    471    <entry key='disposition.minDelayMs' value='10000'/> 
    472    <entry key='disposition.maxDelayMs' value='1000000'/> 
    473    <entry key='disposition.respectCrawlDelayUpToSeconds' value='3600'/> 
    474   </map> 
    475  </property> 
    476 </bean> 
    477  
    478 <!-- highPrecedence: any URI to which this sheet's settings are applied  
    479      will give its containing queue a slightly-higher than default  
    480      queue precedence value. That queue will then be preferred over  
    481      other queues for active crawling, never waiting behind lower- 
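    482      precedence queues. NOTE(review): the map entries below are identical to those of the smallBudget sheet and set no precedence value; confirm the intended key. --> 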
    483 <bean id='highPrecedence' class='org.archive.spring.Sheet'> 
    484  <property name='map'> 
    485   <map> 
    486    <entry key='frontier.balanceReplenishAmount' value='20'/> 
    487    <entry key='frontier.queueTotalBudget' value='100'/> 
    488   </map> 
    489  </property> 
    490 </bean> 
    491  
    492 <!-- 
    493    EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION 
    494    A SheetAssociation says certain URIs should have certain overlay Sheets 
    495    applied. This example applies two sheets to URIs matching two SURT-prefixes. 
    496    New associations may also be added mid-crawl using the scripting facility. 
    497   --> 
    498  
    499 <!-- 
    500 <bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> 
    501  <property name='surtPrefixes'> 
    502   <list> 
    503    <value>http://(org,example,</value> 
    504    <value>http://(com,example,www,)/</value> 
    505   </list> 
    506  </property> 
    507  <property name='targetSheetNames'> 
    508   <list> 
    509    <value>veryPolite</value> 
    510    <value>smallBudget</value> 
    511   </list> 
    512  </property> 
    513 </bean> 
    514 --> 
    515  
    516  <!--  
    517    OPTIONAL BUT RECOMMENDED BEANS 
    518   --> 
    519    
    520  <!-- ACTIONDIRECTORY: disk directory for mid-crawl operations 
    521       Running job will watch directory for new files with URIs,  
    522       scripts, and other data to be processed during a crawl. --> 
    523  <bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory"> 
    524   <!-- <property name="actionDir" value="action" /> --> 
    525   <!-- <property name="doneDir" value="${launchId}/actions-done" /> --> 
    526   <!-- <property name="initialDelaySeconds" value="10" /> --> 
    527   <!-- <property name="delaySeconds" value="30" /> --> 
    528  </bean>  
    529   
    530  <!--  CRAWLLIMITENFORCER: stops crawl when it reaches configured limits --> 
    531  <bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer"> 
    532   <!-- <property name="maxBytesDownload" value="0" /> --> 
    533   <!-- <property name="maxDocumentsDownload" value="0" /> --> 
    534   <!-- <property name="maxTimeSeconds" value="0" /> --> 
    535  </bean> 
    536   
    537  <!-- CHECKPOINTSERVICE: checkpointing assistance --> 
    538  <bean id="checkpointService"  
    539    class="org.archive.crawler.framework.CheckpointService"> 
    540   <!-- <property name="checkpointIntervalMinutes" value="-1"/> --> 
    541   <!-- <property name="checkpointsDir" value="checkpoints"/> --> 
    542  </bean> 
    543   
    544  <!--  
    545    OPTIONAL BEANS 
    546     Uncomment and expand as needed, or if non-default alternate  
    547     implementations are preferred. 
    548   --> 
    549    
    550  <!-- CANONICALIZATION POLICY --> 
    551  <!-- 
    552  <bean id="canonicalizationPolicy"  
    553    class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy"> 
    554    <property name="rules"> 
    555     <list> 
    556      <bean class="org.archive.modules.canonicalize.LowercaseRule" /> 
    557      <bean class="org.archive.modules.canonicalize.StripUserinfoRule" /> 
    558      <bean class="org.archive.modules.canonicalize.StripWWWNRule" /> 
    559      <bean class="org.archive.modules.canonicalize.StripSessionIDs" /> 
    560      <bean class="org.archive.modules.canonicalize.StripSessionCFIDs" /> 
    561      <bean class="org.archive.modules.canonicalize.FixupQueryString" /> 
    562     </list> 
    563   </property> 
    564  </bean> 
    565  --> 
    566   
    567  
    568  <!-- QUEUE ASSIGNMENT POLICY --> 
    569  <!-- 
    570  <bean id="queueAssignmentPolicy"  
    571    class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy"> 
    572   <property name="forceQueueAssignment" value="" /> 
    573   <property name="deferToPrevious" value="true" /> 
    574   <property name="parallelQueues" value="1" /> 
    575  </bean> 
    576  --> 
    577   
    578  <!-- URI PRECEDENCE POLICY --> 
    579  <!-- 
    580  <bean id="uriPrecedencePolicy"  
    581    class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy"> 
    582  </bean> 
    583  --> 
    584   
    585  <!-- COST ASSIGNMENT POLICY --> 
    586  <!-- 
    587  <bean id="costAssignmentPolicy"  
    588    class="org.archive.crawler.frontier.UnitCostAssignmentPolicy"> 
    589  </bean> 
    590  --> 
    591   
    592  <!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials --> 
    593  <!--  
    594  <bean id="credentialStore"  
    595    class="org.archive.modules.credential.CredentialStore"> 
    596  </bean> 
    597  --> 
    598   
    599  <!-- DISK SPACE MONITOR:  
    600       Pauses the crawl if disk space at monitored paths falls below minimum threshold --> 
    601  <!--  
    602  <bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor"> 
    603    <property name="pauseThresholdMiB" value="500" /> 
    604    <property name="monitorConfigPaths" value="true" /> 
    605    <property name="monitorPaths"> 
    606      <list> 
    607        <value>PATH</value> 
    608      </list> 
    609    </property> 
    610  </bean> 
    611  --> 
    612   
    613  <!--  
    614    REQUIRED STANDARD BEANS 
    615     The following beans rarely need to be replaced or reconfigured. 
    616   --> 
    617  
    618  <!-- STATISTICSTRACKER: standard stats/reporting collector --> 
    619  <bean id="statisticsTracker"  
    620    class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName"> 
    621   <!-- <property name="reports"> 
    622         <list> 
    623          <bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" /> 
    624          <bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" /> 
    625          <bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" /> 
    626          <bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" /> 
    627          <bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" /> 
    628          <bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" /> 
    629          <bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" /> 
    630          <bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" /> 
    631          <bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" /> 
    632          <bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" /> 
    633         </list> 
    634        </property> --> 
    635   <!-- <property name="reportsDir" value="${launchId}/reports" /> --> 
    636   <!-- <property name="liveHostReportSize" value="20" /> --> 
    637   <!-- <property name="intervalSeconds" value="20" /> --> 
    638   <!-- <property name="keepSnapshotsCount" value="5" /> --> 
    639   <!-- <property name="liveHostReportSize" value="20" /> --> 
    640  </bean> 
    641   
    642  <!-- CRAWLERLOGGERMODULE: shared logging facility --> 
    643  <bean id="loggerModule"  
    644    class="org.archive.crawler.reporting.CrawlerLoggerModule"> 
    645   <!-- <property name="path" value="${launchId}/logs" /> --> 
    646   <!-- <property name="crawlLogPath" value="crawl.log" /> --> 
    647   <!-- <property name="alertsLogPath" value="alerts.log" /> --> 
    648   <!-- <property name="progressLogPath" value="progress-statistics.log" /> --> 
    649   <!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> --> 
    650   <!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> --> 
    651   <!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> --> 
    652   <!-- <property name="logExtraInfo" value="false" /> --> 
    653  </bean> 
    654   
    655  <!-- SHEETOVERLAYSMANAGER: manager of sheets of contextual overlays 
    656       Autowired to include any SheetForSurtPrefix or  
    657       SheetForDecideRuled beans --> 
    658  <bean id="sheetOverlaysManager" autowire="byType" 
    659    class="org.archive.crawler.spring.SheetOverlaysManager"> 
    660  </bean> 
    661  
    662  <!-- BDBMODULE: shared BDB-JE disk persistence manager --> 
    663  <bean id="bdb"  
    664   class="org.archive.bdb.BdbModule"> 
    665   <!-- <property name="dir" value="state" /> --> 
    666   <!-- <property name="cachePercent" value="60" /> --> 
    667   <!-- <property name="useSharedCache" value="true" /> --> 
    668   <!-- <property name="expectedConcurrency" value="25" /> --> 
    669  </bean> 
    670   
    671  <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP --> 
    672  <bean id="cookieStorage"  
    673    class="org.archive.modules.fetcher.BdbCookieStorage"> 
    674   <!-- <property name="cookiesLoadFile"><null/></property> --> 
    675   <!-- <property name="cookiesSaveFile"><null/></property> --> 
    676   <!-- <property name="bdb"> 
    677         <ref bean="bdb"/> 
    678        </property> --> 
    679  </bean> 
    680   
    681  <!-- SERVERCACHE: shared cache of server/host info --> 
    682  <bean id="serverCache"  
    683    class="org.archive.modules.net.BdbServerCache"> 
    684   <!-- <property name="bdb"> 
    685         <ref bean="bdb"/> 
    686        </property> --> 
    687  </bean> 
    688  
    689  <!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative 
    690       to crawler-beans.cxml file, and tracking crawl files for web UI --> 
    691  <bean id="configPathConfigurer"  
    692    class="org.archive.spring.ConfigPathConfigurer"> 
    693  </bean> 
    694   
    695 </beans> 
     20</xsl:stylesheet> 
Note: See TracChangeset for help on using the changeset viewer.